4#include <unordered_map>
5#include <unordered_set>
19namespace Autoscheduler {
22struct GlobalAccessAccumulator;
24struct SharedAccessAccumulator;
26struct LocalAccessAccumulator;
33 static constexpr double bytes_per_transaction = 32;
40 static constexpr double bytes_per_transaction = 128;
47 static constexpr double bytes_per_transaction = 32;
60 return total_num_transactions;
81 total_num_transactions +=
other.total_num_transactions;
82 total_num_bytes_used +=
other.total_num_bytes_used;
83 total_num_bytes +=
other.total_num_bytes;
87 if (total_num_bytes == 0) {
91 double result = total_num_bytes_used / total_num_bytes;
103 double total_num_transactions = 0;
104 double total_num_bytes_used = 0;
105 double total_num_bytes = 0;
117 Strides(
const std::vector<int64_t> &storage_strides)
118 : storage_strides{storage_strides} {
138 for (
size_t i = 0;
i < storage_strides.size(); ++
i) {
141 return std::abs(result);
144 void dump(
bool verbose =
false) {
149 for (
size_t i = 0;
i < storage_strides.size(); ++
i) {
151 aslog(2) <<
"stride " <<
i <<
": invalid\n";
154 aslog(2) <<
"storage_stride " <<
i <<
": " << storage_strides[
i] <<
"\n";
157 for (
size_t i = 0;
i < index_strides.size(); ++
i) {
158 for (
size_t j = 0;
j < index_strides[
i].size(); ++
j) {
159 aslog(2) <<
"index_stride " <<
i <<
", storage_stride " <<
j <<
": " << index_strides[
i][
j] <<
" ";
166 void add(
const std::vector<double> &strides,
bool e) {
167 index_strides.push_back(strides);
168 is_valid.push_back(e);
171 std::vector<int64_t> storage_strides;
172 std::vector<std::vector<double>> index_strides;
173 std::vector<bool> is_valid;
178 : bytes_per_access{bytes_per_access}, dimensions{dimensions}, strides{strides}, verbose{verbose} {
187 aslog(2) <<
"thread_id: " <<
thread_id <<
" (" << x <<
", " << y <<
", " << z <<
")\n";
192 for (
size_t i = 0;
i < dimensions; ++
i) {
193 if (!strides.valid(
i)) {
197 byte += bytes_per_access * strides.offset(
i,
thread_ids[
i]);
201 aslog(2) <<
"byte accessed: " <<
byte <<
"\n";
206 aslog(2) <<
"sectors accessed: ";
208 for (
int i = 0;
i < bytes_per_access; ++
i) {
212 sectors_accessed[
sector].insert(
byte +
i);
230 for (
const auto &
sector : sectors_accessed) {
250 int bytes_per_access;
254 int unknown_sectors = 0;
255 std::unordered_map<int64_t, std::unordered_set<int64_t>> sectors_accessed;
260 : bytes_per_access{bytes_per_access}, dimensions{dimensions}, strides{strides}, verbose{verbose} {
269 aslog(2) <<
"thread_id: " <<
thread_id <<
" (" << x <<
", " << y <<
", " << z <<
")\n";
274 for (
size_t i = 0;
i < dimensions; ++
i) {
275 if (!strides.valid(
i)) {
279 byte += bytes_per_access * strides.offset(
i,
thread_ids[
i]);
283 aslog(2) <<
"bytes accessed: ";
284 for (
int i = 0;
i < bytes_per_access; ++
i) {
285 aslog(2) <<
byte +
i <<
" ";
291 aslog(2) <<
"banks accessed: ";
293 for (
int i = 0;
i < bytes_per_access; ++
i) {
299 bytes_accessed.insert(
byte +
i);
300 bank_to_words_accessed[
bank].insert(
word);
309 for (
const auto &
bank : bank_to_words_accessed) {
340 int bytes_per_access;
344 int unknown_banks = 0;
345 std::unordered_set<int64_t> bytes_accessed;
346 std::array<std::unordered_set<int64_t>, 32> bank_to_words_accessed;
351 : bytes_per_access{bytes_per_access}, verbose{verbose} {
362 aslog(2) <<
"thread_id: " <<
thread_id <<
" (" << x <<
", " << y <<
", " << z <<
")\n";
392 int bytes_per_access;
394 int thread_count = 0;
395 std::unordered_map<int64_t, std::unordered_set<int64_t>> sectors_accessed;
#define internal_assert(c)
typename MemTraits< T >::Accumulator Accumulator
This file defines the class FunctionDAG, which is our representation of a Halide pipeline,...
@ Internal
Not visible externally, similar to 'static' linkage in C.
Expr cast(Expr a)
Cast an expression to the halide type corresponding to the C++ type T.
signed __INT64_TYPE__ int64_t
GlobalAccessAccumulator(int bytes_per_access, size_t dimensions, const Strides &strides, bool verbose)
void operator()(int thread_id, int x, int y, int z, int active, bool last_thread)
void add_access_info(int num_requests, GlobalMemInfo &global_mem_info, bool is_tail_warp) const
void add_access_info(int num_requests, LocalMemInfo &local_mem_info, bool is_tail_warp) const
LocalAccessAccumulator(int bytes_per_access, bool verbose)
void operator()(int thread_id, int x, int y, int z, int active, bool last_thread)
void add_access_info(double num_requests, double num_transactions_per_request, double num_bytes_used_per_request)
static constexpr double bytes_per_transaction
double efficiency() const
double num_transactions() const
void add(const MemInfo< T > &other)
void operator()(int thread_id, int x, int y, int z, int active, bool last_thread)
SharedAccessAccumulator(int bytes_per_access, size_t dimensions, const Strides &strides, bool verbose)
void add_access_info(int num_requests, SharedMemInfo &shared_mem_info, bool is_tail_warp) const
bool valid(size_t loop_index) const
void dump(bool verbose=false)
void add_valid(const std::vector< double > &strides)
Strides(const std::vector< int64_t > &storage_strides)
int64_t offset(size_t loop_index, int64_t point) const