Halide 16.0.0
Halide compiler and libraries
State.h
Go to the documentation of this file.
#ifndef STATE_H
#define STATE_H

#include "ASLog.h"
#include "CostModel.h"
#include "DefaultCostModel.h"
#include "Featurization.h"
#include "FunctionDAG.h"
#include "LoopNest.h"
#include "PerfectHashMap.h"
#include <set>
#include <unordered_set>
#include <vector>

namespace Halide {
namespace Internal {
namespace Autoscheduler {

using std::map;
using std::pair;
using std::set;
using std::string;
using std::unordered_set;
using std::vector;

constexpr int kLocalMemoryLimit = 524288;  // 512 KB

// Stack memory limit = Total GPU Memory / (# of SMs * maximum threads per SM)
// = 103232 bytes
// Not all 103232 bytes will be free for allocations, so reduce it by a factor
// to allow a buffer.

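// A minimal illustrative check of the figure above (not part of the original
// header), assuming a hypothetical GPU exposing ~15.75 GiB of global memory,
// 80 SMs, and 2048 resident threads per SM; the constant names are
// placeholders, not Halide identifiers:
//
//   constexpr int64_t kTotalGlobalMemory = 16913530880;  // ~15.75 GiB
//   constexpr int64_t kNumSMs = 80;
//   constexpr int64_t kMaxThreadsPerSM = 2048;
//   static_assert(kTotalGlobalMemory / (kNumSMs * kMaxThreadsPerSM) == 103232,
//                 "per-thread stack budget before the safety factor");
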
struct NoOpMutator {
    void operator()(LoopNest *new_loop_nest) const {
    }
};

template<typename PostCreateMutator>
void deep_copy_loop_nest(LoopNest *new_loop_nest,
                         const LoopNest *new_loop_nest_parent,
                         const IntrusivePtr<const LoopNest> &existing_loop_nest,
                         const PostCreateMutator &post_create_mutator);

template<typename PostCreateMutator>
LoopNest *deep_copy_loop_nest(const IntrusivePtr<const LoopNest> &loop_nest,
                              const PostCreateMutator &post_create_mutator);

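// Illustrative sketch (not part of State.h): a PostCreateMutator is any
// callable that receives each freshly copied, still-mutable LoopNest* before
// it is frozen behind an IntrusivePtr<const LoopNest>. NoOpMutator above is
// the trivial case, and State::FeatureLoopNestMutator below is presumably the
// one used when building the featurization root. A hypothetical custom
// mutator would look like:
//
//   struct CountLoopsMutator {
//       mutable int64_t loops_visited = 0;
//       void operator()(LoopNest *new_loop_nest) const {
//           // Runs on every copied node while it can still be modified.
//           loops_visited++;
//       }
//   };
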
struct State {
    mutable RefCount ref_count;
    IntrusivePtr<const LoopNest> root;
    IntrusivePtr<const State> parent;
    double cost = 0;
    std::vector<double> cost_per_stage;
    NodeMap<bool> always_consider_inline;
    int num_decisions_made = 0;
    bool penalized = false;
    string schedule_source;

    State() = default;
    State(const State &) = delete;
    State(State &&) = delete;
    void operator=(const State &) = delete;
    void operator=(State &&) = delete;

    uint64_t structural_hash(int depth) const;

    // Compute the parent and depth of every loop nest node.
    void compute_loop_nest_parents(map<const LoopNest *, pair<const LoopNest *, int>> &p,
                                   const LoopNest *here, int depth) const;

    const LoopNest *deepest_common_ancestor(const map<const LoopNest *, pair<const LoopNest *, int>> &parent,
                                            const LoopNest *a, const LoopNest *b) const;

    // We use the post_create_mutator so that the loop nests can be modified
    // before they become IntrusivePtr<const LoopNest> children and can no
    // longer be modified.
    template<typename PostCreateMutator>
    LoopNest *create_feature_root(const PostCreateMutator &post_create_mutator) const;

    struct FeatureLoopNestMutator {
        const Anderson2021Params &params;
        const Target &target;

        void operator()(LoopNest *new_loop_nest) const;

        // In phase 2, any compute_root loop marked 'none' will be split into
        // blocks, threads, and serial loops. To enable the cost model to make a
        // meaningful prediction on these pre-split loops, we assume a split into
        // blocks and threads with a single full warp (if possible)
        void split_compute_root_loops(LoopNest *loop_nest) const;

        // If a loop nest does not have thread loops, split the outermost serial
        // loops to create thread loops with extents 1
        void add_outer_thread_loops(LoopNest *loop_nest) const;
    };
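    // Illustrative example (not from the original header): a full warp is 32
    // threads, so under the assumption above a compute_root loop of extent 256
    // that is still marked 'none' would be modeled as 8 blocks of 32 threads
    // (256 = 8 * 32), while add_outer_thread_loops would wrap a purely serial
    // nest in thread loops of extent 1 so the featurization still sees a
    // block/thread structure.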

    IntrusivePtr<const LoopNest> get_root_for_features(const Anderson2021Params &params, const Target &target) const;

    void set_gpu_store_site(const map<const LoopNest *, pair<const LoopNest *, int>> &parent, const LoopNest *loop, LoopNest::Sites &site) const;

    bool compute_featurization(const FunctionDAG &dag, const Anderson2021Params &params, const Target &target, StageMap<ScheduleFeatures> *features, Statistics &stats, bool verbose = false) const;

    void save_featurization(const FunctionDAG &dag, const Anderson2021Params &params, const Target &target, std::ostream &out) const;

    bool contains_store_at(const set<const FunctionDAG::Node *> &outermost_store_at, const IntrusivePtr<const LoopNest> &parent) const;

    // For GPU, only allow store_at root or inside the outermost loop nest. Any
    // store_ats further in will be hoisted and expanded, increasing the
    // amount of shared memory required.
    bool contains_store_at_further_in_than_outermost() const;

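    // Illustrative example (not from the original header): for a hypothetical
    // producer f consumed inside g's GPU nest, f.store_root() or storing f at
    // g's outermost (block) loop keeps a single shared-memory allocation per
    // block, whereas a store_at deeper in the nest would be hoisted back out
    // and expanded across the intervening loop extents, inflating the shared
    // memory the block requires.
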
    bool exceeds_serial_extents_limit(const Target &target) const;

    int64_t get_shared_mem_alloc_size(const LoopNest *block, const LoopNest *loop) const;

    bool exceeds_shared_memory_limit(const Anderson2021Params &params, const Target &target) const;

    bool exceeds_local_memory_limit(const Anderson2021Params &params, const Target &target) const;

    bool calculate_cost(const FunctionDAG &dag, const Anderson2021Params &params, const Target &target, CostModel *cost_model, Statistics &stats, bool verbose = false);

    // Make a child copy of this state. The loop nest is const (we
    // make mutated copies of it, rather than mutating it), so we can
    // continue to point to the same one and so this is a cheap
    // operation.
    IntrusivePtr<State> make_child() const;

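    // Illustrative note (not from the original header): because root is an
    // IntrusivePtr<const LoopNest>, a child produced by make_child() shares
    // its parent's loop nest tree, so copying a State is cheap regardless of
    // loop nest size; search steps that change the nest build a fresh tree
    // rather than mutating the shared one. A hypothetical use:
    //
    //   IntrusivePtr<State> child = make_child();  // shares `root`
    //   child->num_decisions_made++;               // no loop nest copy needed
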
    void dump() const;

    void fuse_gpu_blocks(LoopNest::StageScheduleState *state, Stage &stage, const vector<VarOrRVar> &parallel_vars, const vector<int64_t> &parallel_extents, const vector<int> &constant_extents) const;

    void mark_gpu_blocks(LoopNest::StageScheduleState *state, Stage &stage, const vector<VarOrRVar> &parallel_vars, const vector<int64_t> &parallel_extents) const;

    bool mark_gpu_threads(LoopNest::StageScheduleState *state, Stage &stage, std::unordered_set<std::string> &new_serial_vars, std::ostringstream &staged_funcs_schedule_source) const;

    bool can_fuse_gpu(const vector<int64_t> &parallel_extents) const;

    // Apply the schedule represented by this state to a Halide
    // Pipeline. Also generate source code for the schedule for the
    // user to copy-paste to freeze this schedule as a permanent artifact.
    void apply_schedule(const FunctionDAG &dag, const Anderson2021Params &params, const Target &target);

    bool should_always_consider_inline(const FunctionDAG::Node *node) const;
    void add_to_always_consider_inline_options(const FunctionDAG::Node *node);
    void update_always_consider_inline_options(const FunctionDAG::Node *node);

    const LoopNest *deepest_valid_compute_location(const Anderson2021Params &params, const map<const LoopNest *, pair<const LoopNest *, int>> &parent, const FunctionDAG::Node &node, const LoopNest *loop, const LoopNest *root, StageMap<int64_t> &total_shared_mem_alloc_sizes) const;
    int64_t total_loop_extents_of_ancestors(const map<const LoopNest *, pair<const LoopNest *, int>> &parent, const LoopNest *loop) const;
};

// A priority queue of states, sorted according to increasing
// cost. Never shrinks, to avoid reallocations.
// Can't use std::priority_queue because it doesn't support unique_ptr.
class StateQueue {
private:
    struct CompareStates {
        bool operator()(const IntrusivePtr<State> &a, const IntrusivePtr<State> &b) const {
            return a->cost > b->cost;
        }
    };

    std::vector<IntrusivePtr<State>> storage;
    size_t sz = 0;

public:
    void emplace(IntrusivePtr<State> &&s) {
        if (sz >= storage.size()) {
            storage.resize(std::max(sz * 2, (size_t)64));
        }
        internal_assert(sz < storage.size()) << sz << " " << storage.size() << "\n";
        storage[sz] = std::move(s);
        sz++;
        std::push_heap(storage.begin(), storage.begin() + sz, CompareStates{});
    }

    IntrusivePtr<State> pop() {
        internal_assert(sz <= storage.size()) << sz << " " << storage.size() << "\n";
        std::pop_heap(storage.begin(), storage.begin() + sz, CompareStates{});
        sz--;
        return std::move(storage[sz]);
    }

    const IntrusivePtr<State> &top() {
        return storage[0];
    }

    bool empty() const {
        return sz == 0;
    }

    size_t size() const {
        return sz;
    }

    void swap(StateQueue &other) {
        storage.swap(other.storage);
        std::swap(sz, other.sz);
    }

    IntrusivePtr<State> operator[](int idx) const {
        return storage[idx];
    }

    void resort() {
        std::make_heap(storage.begin(), storage.begin() + sz, CompareStates{});
    }

    void clear() {
        for (size_t i = 0; i < sz; i++) {
            storage[i] = IntrusivePtr<State>{};
        }
        sz = 0;
    }
};
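// Illustrative sketch (not part of State.h): CompareStates orders by
// operator>, so the std::push_heap/std::pop_heap calls above maintain a
// min-heap on cost, and pop() always returns the cheapest remaining state.
// A hypothetical use in a search loop (some_state is a placeholder):
//
//   StateQueue pending;
//   pending.emplace(std::move(some_state));  // some_state: IntrusivePtr<State>
//   while (!pending.empty()) {
//       IntrusivePtr<State> cheapest = pending.pop();
//       // ...expand `cheapest` and emplace its children...
//   }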

}  // namespace Autoscheduler
}  // namespace Internal
}  // namespace Halide

#endif  // STATE_H