// We can kick off the search and let the auto-scheduler do its magic.
// After some measurement trials, it will return the best schedule it found.
# The returned pair is unpacked into `sch` and `args`; both are passed to
# tvm.lower and tvm.build further down.
sch, args = auto_scheduler.auto_schedule(task, tuning_options=tune_option)
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// We can lower the schedule to see the IR after auto-scheduling.
// The auto-scheduler correctly performs optimizations including multi-level tiling,
// parallelization, vectorization, unrolling and operator fusion.
print(tvm.lower(sch, args, simple_mode=True))
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Check correctness and evaluate performance
// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
// We build the binary and check its correctness and performance.
func = tvm.build(sch, args)

# Three 128x128 float32 operands, drawn one after another in the order A, B, C.
a_np, b_np, c_np = (
    np.random.uniform(size=(128, 128)).astype(np.float32) for _ in range(3)
)
# NumPy reference result that the compiled operator is compared against.
out_np = np.dot(a_np, b_np) + c_np

# Wrap the host arrays for the CPU context and allocate an output buffer with
# the same shape as the reference result.
ctx = tvm.cpu()
a_tvm, b_tvm, c_tvm = (
    tvm.nd.array(host, ctx=ctx) for host in (a_np, b_np, c_np)
)
out_tvm = tvm.nd.empty(out_np.shape, ctx=ctx)

# Run the built function; the output buffer is passed as the last argument.
func(a_tvm, b_tvm, c_tvm, out_tvm)
// Check results
# Compares the NumPy reference (out_np) with the TVM output converted via
# asnumpy(), using a relative tolerance of 1e-3.
np.testing.assert_allclose(out_np, out_tvm.asnumpy(), rtol=1e-3)
// Evaluate execution time.
# Build a timing wrapper around the function's entry point on the same context.
evaluator = func.time_evaluator(func.entry_name, ctx, min_repeat_ms=500)
timing = evaluator(a_tvm, b_tvm, c_tvm, out_tvm)
# The median of the recorded results is scaled by 1000 to match the "ms" label
# (presumably the raw values are in seconds -- confirm against the TVM docs).
median_ms = np.median(timing.results) * 1000
print("Execution time of this operator: %.3f ms" % median_ms)
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Using the record file
// ^^^^^^^^^^^^^^^^^^^^^
// During the search, all measurement records are dumped into the record
// file "matmul.json". The measurement records can be used to re-apply search results,
// resume the search, and perform other analyses.
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Here is an example where we load the best schedule from a file,
// print the equivalent python schedule API, and build the binary again.
// Load the measurement record for the best schedule
# `log_file` is defined outside this excerpt (presumably the record file the
# search wrote to -- confirm where it is set). The result is unpacked into
# `inp` and `res`.
inp, res = auto_scheduler.load_best(log_file, task.workload_key)
// Print equivalent python schedule API. This can be used for debugging and
// learning the behavior of the auto-scheduler.
print("Equivalent python schedule:")
After Change
// Run auto-tuning (search)
task.tune(tune_option)
// Apply the best schedule
sch, args = task.apply_best(log_file)
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// We can lower the schedule to see the IR after auto-scheduling.
// The auto-scheduler correctly performs optimizations including multi-level tiling,