RandomTree committed on
Commit
3a88875
·
verified ·
1 Parent(s): ac9a247

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +27 -0
  2. context_encoding_model/_tp0_bk0/command.txt +1 -0
  3. context_encoding_model/_tp0_bk0/compile_flags.MODULE_e7e1b6c43bb87ca73ecc+2ee9f01d.json +1 -0
  4. context_encoding_model/_tp0_bk0/global_metric_store.json +1147 -0
  5. context_encoding_model/_tp0_bk0/graph.neff +3 -0
  6. context_encoding_model/_tp0_bk0/log-neuron-cc.txt +0 -0
  7. context_encoding_model/_tp0_bk0/metaneff.pb +3 -0
  8. context_encoding_model/_tp0_bk0/model.MODULE_e7e1b6c43bb87ca73ecc+2ee9f01d.hlo_module.pb +3 -0
  9. context_encoding_model/_tp0_bk0/model.MODULE_e7e1b6c43bb87ca73ecc+2ee9f01d.neff +3 -0
  10. context_encoding_model/_tp0_bk0/neuron_config.json +224 -0
  11. context_encoding_model/_tp0_bk1/command.txt +1 -0
  12. context_encoding_model/_tp0_bk1/compile_flags.MODULE_2330bfb0632c950ddab1+62ecd68b.json +1 -0
  13. context_encoding_model/_tp0_bk1/global_metric_store.json +1177 -0
  14. context_encoding_model/_tp0_bk1/graph.neff +3 -0
  15. context_encoding_model/_tp0_bk1/log-neuron-cc.txt +0 -0
  16. context_encoding_model/_tp0_bk1/metaneff.pb +3 -0
  17. context_encoding_model/_tp0_bk1/model.MODULE_2330bfb0632c950ddab1+62ecd68b.hlo_module.pb +3 -0
  18. context_encoding_model/_tp0_bk1/model.MODULE_2330bfb0632c950ddab1+62ecd68b.neff +3 -0
  19. context_encoding_model/_tp0_bk1/neuron_config.json +224 -0
  20. context_encoding_model/_tp0_bk2/command.txt +1 -0
  21. context_encoding_model/_tp0_bk2/compile_flags.MODULE_49bb42f69f5b159ae769+3467f95e.json +1 -0
  22. context_encoding_model/_tp0_bk2/global_metric_store.json +1177 -0
  23. context_encoding_model/_tp0_bk2/graph.neff +3 -0
  24. context_encoding_model/_tp0_bk2/log-neuron-cc.txt +0 -0
  25. context_encoding_model/_tp0_bk2/metaneff.pb +3 -0
  26. context_encoding_model/_tp0_bk2/model.MODULE_49bb42f69f5b159ae769+3467f95e.hlo_module.pb +3 -0
  27. context_encoding_model/_tp0_bk2/model.MODULE_49bb42f69f5b159ae769+3467f95e.neff +3 -0
  28. context_encoding_model/_tp0_bk2/neuron_config.json +224 -0
  29. context_encoding_model/_tp0_bk3/command.txt +1 -0
  30. context_encoding_model/_tp0_bk3/compile_flags.MODULE_be035899334776123ed5+d208bdce.json +1 -0
  31. context_encoding_model/_tp0_bk3/global_metric_store.json +1177 -0
  32. context_encoding_model/_tp0_bk3/graph.neff +3 -0
  33. context_encoding_model/_tp0_bk3/log-neuron-cc.txt +0 -0
  34. context_encoding_model/_tp0_bk3/metaneff.pb +3 -0
  35. context_encoding_model/_tp0_bk3/model.MODULE_be035899334776123ed5+d208bdce.hlo_module.pb +3 -0
  36. context_encoding_model/_tp0_bk3/model.MODULE_be035899334776123ed5+d208bdce.neff +3 -0
  37. context_encoding_model/_tp0_bk3/neuron_config.json +224 -0
  38. context_encoding_model/_tp0_bk4/command.txt +1 -0
  39. context_encoding_model/_tp0_bk4/compile_flags.MODULE_95ef7ca73cc0a6161be2+96be3c33.json +1 -0
  40. context_encoding_model/_tp0_bk4/global_metric_store.json +1177 -0
  41. context_encoding_model/_tp0_bk4/graph.neff +3 -0
  42. context_encoding_model/_tp0_bk4/log-neuron-cc.txt +0 -0
  43. context_encoding_model/_tp0_bk4/metaneff.pb +3 -0
  44. context_encoding_model/_tp0_bk4/model.MODULE_95ef7ca73cc0a6161be2+96be3c33.hlo_module.pb +3 -0
  45. context_encoding_model/_tp0_bk4/model.MODULE_95ef7ca73cc0a6161be2+96be3c33.neff +3 -0
  46. context_encoding_model/_tp0_bk4/neuron_config.json +224 -0
  47. context_encoding_model/_tp0_bk5/command.txt +1 -0
  48. context_encoding_model/_tp0_bk5/compile_flags.MODULE_96a8f4e12dc810958634+b1e26cef.json +1 -0
  49. context_encoding_model/_tp0_bk5/global_metric_store.json +1177 -0
  50. context_encoding_model/_tp0_bk5/graph.neff +3 -0
.gitattributes CHANGED
@@ -33,3 +33,30 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ context_encoding_model/_tp0_bk0/graph.neff filter=lfs diff=lfs merge=lfs -text
37
+ context_encoding_model/_tp0_bk0/model.MODULE_e7e1b6c43bb87ca73ecc+2ee9f01d.neff filter=lfs diff=lfs merge=lfs -text
38
+ context_encoding_model/_tp0_bk1/graph.neff filter=lfs diff=lfs merge=lfs -text
39
+ context_encoding_model/_tp0_bk1/model.MODULE_2330bfb0632c950ddab1+62ecd68b.neff filter=lfs diff=lfs merge=lfs -text
40
+ context_encoding_model/_tp0_bk2/graph.neff filter=lfs diff=lfs merge=lfs -text
41
+ context_encoding_model/_tp0_bk2/model.MODULE_49bb42f69f5b159ae769+3467f95e.neff filter=lfs diff=lfs merge=lfs -text
42
+ context_encoding_model/_tp0_bk3/graph.neff filter=lfs diff=lfs merge=lfs -text
43
+ context_encoding_model/_tp0_bk3/model.MODULE_be035899334776123ed5+d208bdce.neff filter=lfs diff=lfs merge=lfs -text
44
+ context_encoding_model/_tp0_bk4/graph.neff filter=lfs diff=lfs merge=lfs -text
45
+ context_encoding_model/_tp0_bk4/model.MODULE_95ef7ca73cc0a6161be2+96be3c33.neff filter=lfs diff=lfs merge=lfs -text
46
+ context_encoding_model/_tp0_bk5/graph.neff filter=lfs diff=lfs merge=lfs -text
47
+ context_encoding_model/_tp0_bk5/model.MODULE_96a8f4e12dc810958634+b1e26cef.neff filter=lfs diff=lfs merge=lfs -text
48
+ layout_opt/graph.neff filter=lfs diff=lfs merge=lfs -text
49
+ layout_opt/model/graph.hlo filter=lfs diff=lfs merge=lfs -text
50
+ token_generation_model/_tp0_bk0/graph.neff filter=lfs diff=lfs merge=lfs -text
51
+ token_generation_model/_tp0_bk0/model.MODULE_caeca0352a0240106f96+d5490f71.neff filter=lfs diff=lfs merge=lfs -text
52
+ token_generation_model/_tp0_bk0/wrapped_neff.hlo filter=lfs diff=lfs merge=lfs -text
53
+ token_generation_model/_tp0_bk1/graph.neff filter=lfs diff=lfs merge=lfs -text
54
+ token_generation_model/_tp0_bk1/model.MODULE_122f32d499d16ac150a0+bdebe6e1.neff filter=lfs diff=lfs merge=lfs -text
55
+ token_generation_model/_tp0_bk2/graph.neff filter=lfs diff=lfs merge=lfs -text
56
+ token_generation_model/_tp0_bk2/model.MODULE_bac42b9b464c64624582+1ea12800.neff filter=lfs diff=lfs merge=lfs -text
57
+ token_generation_model/_tp0_bk3/graph.neff filter=lfs diff=lfs merge=lfs -text
58
+ token_generation_model/_tp0_bk3/model.MODULE_8aa2bc135acfce1f4a61+bd0ab490.neff filter=lfs diff=lfs merge=lfs -text
59
+ token_generation_model/_tp0_bk4/graph.neff filter=lfs diff=lfs merge=lfs -text
60
+ token_generation_model/_tp0_bk4/model.MODULE_ec05e5a8222761962028+3b7d8ecf.neff filter=lfs diff=lfs merge=lfs -text
61
+ token_generation_model/_tp0_bk5/graph.neff filter=lfs diff=lfs merge=lfs -text
62
+ token_generation_model/_tp0_bk5/model.MODULE_b0c5e51af4aeb4ea04b2+a0432539.neff filter=lfs diff=lfs merge=lfs -text
context_encoding_model/_tp0_bk0/command.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ neuronx-cc compile --framework=XLA model.MODULE_e7e1b6c43bb87ca73ecc+2ee9f01d.hlo_module.pb --output model.MODULE_e7e1b6c43bb87ca73ecc+2ee9f01d.neff --target=trn2 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ' --lnc=2 -O1 '--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true' --logfile=log-neuron-cc.txt --verbose=35
context_encoding_model/_tp0_bk0/compile_flags.MODULE_e7e1b6c43bb87ca73ecc+2ee9f01d.json ADDED
@@ -0,0 +1 @@
 
 
1
+ ["--target=trn2", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "--lnc=2", "-O1", "--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true", "--logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk0/log-neuron-cc.txt"]
context_encoding_model/_tp0_bk0/global_metric_store.json ADDED
@@ -0,0 +1,1147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Average": {
3
+ "tensorizer": {
4
+ "StaticProfiler::AverageFractalPeUtilization": 98.70232391357422,
5
+ "StaticProfiler::AveragePartitionUtilization": 94.02606201171875,
6
+ "StaticProfiler::AveragePeUtilization": 96.57791900634766,
7
+ "StaticProfiler::LocalizationEfficiency": 96.75444030761719,
8
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 99.23246002197266,
9
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
10
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0
11
+ }
12
+ },
13
+ "Count": {
14
+ "tensorizer": {
15
+ "StaticProfiler::AverageFractalPeUtilization": 1.0,
16
+ "StaticProfiler::AveragePartitionUtilization": 1.0,
17
+ "StaticProfiler::AveragePeUtilization": 1.0,
18
+ "StaticProfiler::LocalizationEfficiency": 1.0,
19
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1.0,
20
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 1.0,
21
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 1.0
22
+ }
23
+ },
24
+ "Sum": {
25
+ "compiletime": {
26
+ "AGOrderingAnalysisPass": 0.037471771240234375,
27
+ "AffinePredicateResolution": 0.0048100948333740234,
28
+ "AliasDependencyElimination": 0.0002529621124267578,
29
+ "AliasDependencyInduction": 0.005568504333496094,
30
+ "AliasDependencyReset": 0.11161017417907715,
31
+ "BFComputeCutting": 0.0024290084838867188,
32
+ "BirCodeGenLoop": 0.32352304458618164,
33
+ "CCOpFusion": 0.033486366271972656,
34
+ "CanonicalizeConv": 2.7000001864507794e-05,
35
+ "CanonicalizeDAGForPGTiling": 0.004197120666503906,
36
+ "CanonicalizeForTensorizer": 3.899999865097925e-05,
37
+ "CanonicalizeIR": 0.0025298595428466797,
38
+ "Canonicalizer": 0.00088900001719594,
39
+ "CoalesceCCOp": 0.014135599136352539,
40
+ "CommuteConcat": 0.0018744468688964844,
41
+ "DMALocalityOpt": 0.01189279556274414,
42
+ "DMAProfiler": 0.025990962982177734,
43
+ "DMATilingProfiler": 0.015254497528076172,
44
+ "DataLocalityOpt": 0.1120154857635498,
45
+ "DataStreaming": 0.03728485107421875,
46
+ "DeConcat": 0.0022406578063964844,
47
+ "DeadCodeElimination": 0.0021486282348632813,
48
+ "DeadStoreElimination": 0.0063364505767822266,
49
+ "DelinearIndices": 0.0064697265625,
50
+ "Delinearization": 0.004486560821533203,
51
+ "DelinearizeSPMD": 0.01732611656188965,
52
+ "DoNothing": 0.0007770061492919922,
53
+ "DramToDramTranspose": 0.02082037925720215,
54
+ "DumpGraphAndMetadata": 0.036411285400390625,
55
+ "EliminateDivs": 0.01006174087524414,
56
+ "ExpandBatchNorm": 0.0024886131286621094,
57
+ "ExpandISAMacro": 0.01822209358215332,
58
+ "FactorizeBlkDims": 0.07448649406433105,
59
+ "FactorizeThreadAxesInFreeDims": 0.0071103572845458984,
60
+ "FlattenMacroLoop": 0.009794235229492188,
61
+ "GenericAccessSimplifier": 0.0009224414825439453,
62
+ "HoistCompute": 7.000000096013537e-06,
63
+ "IdentifyCrossPassTensors": 3.600000127335079e-05,
64
+ "InferInitValue": 0.12128233909606934,
65
+ "InferIntrinsicOnCC": 0.01005697250366211,
66
+ "InferNeuronTensor": 0.029047489166259766,
67
+ "InferNonlocalTensors": 0.017493009567260742,
68
+ "InferPSumTensor": 0.09335684776306152,
69
+ "InferShardAxis": 0.26027798652648926,
70
+ "InferSharedMemLoc": 0.016659259796142578,
71
+ "InlineNativeKernels": 0.002816915512084961,
72
+ "InsertCoreBarrier": 0.0162966251373291,
73
+ "InsertIOTransposes": 0.019797325134277344,
74
+ "InsertImplicitShardAxisBeforeISel": 0.05061173439025879,
75
+ "InsertLocalTransposes": 0.004299163818359375,
76
+ "InsertOffloadedTransposes": 0.008011579513549805,
77
+ "LICM": 0.009003639221191406,
78
+ "LateLegalizeInst": 0.035849571228027344,
79
+ "LateLegalizePostSplit": 0.013758182525634766,
80
+ "LateLowerReshapeOp": 0.0012693405151367188,
81
+ "LateLowerTensorOp": 0.002027750015258789,
82
+ "LateNeuronInstComb": 0.14670348167419434,
83
+ "LayoutPreprocessing": 0.025156497955322266,
84
+ "LayoutPreprocessingAndAnalysis": 0.06950831413269043,
85
+ "LayoutRequirementAnalysis": 0.0069408416748046875,
86
+ "LegalizeCCOpLayout": 0.003494739532470703,
87
+ "LegalizeOpLevelAlias": 0.0016810894012451172,
88
+ "LegalizePartitionReduce": 0.0026693344116210938,
89
+ "LegalizeSundaAccess": 0.08684325218200684,
90
+ "LegalizeSundaMacro": 0.10486245155334473,
91
+ "LegalizeType": 0.06927132606506348,
92
+ "LocalLayoutOpt": 0.012215137481689453,
93
+ "LoopFusion": 0.0049479007720947266,
94
+ "LoopSplitting": 0.0008144378662109375,
95
+ "LowerBroadcast": 0.019241809844970703,
96
+ "LowerCCOpBlockAxis": 0.0037145614624023438,
97
+ "LowerComplexBroadcast": 0.0070230960845947266,
98
+ "LowerIntrinsics": 0.0899801254272461,
99
+ "LowerShardAxis": 0.020240068435668945,
100
+ "LowerTensorOp": 0.028459787368774414,
101
+ "LowerToSendRecv": 0.02129983901977539,
102
+ "LowerTranspose": 0.06694269180297852,
103
+ "MacroGeneration": 0.03631877899169922,
104
+ "MaskPropagation": 0.004620075225830078,
105
+ "MemcastMotion": 2.89999989036005e-05,
106
+ "MemcpyElimination": 0.04741477966308594,
107
+ "MutateDataType": 0.002264261245727539,
108
+ "NeuronAliasDependencyInduction": 0.002180337905883789,
109
+ "NeuronAliasDependencyReset": 0.08514618873596191,
110
+ "NeuronInstComb": 0.05580711364746094,
111
+ "NeuronLICM": 0.047100067138671875,
112
+ "NeuronLoopFusion": 0.05364656448364258,
113
+ "NeuronLoopInterchange": 0.002526521682739258,
114
+ "NeuronSimplifier": 0.06896662712097168,
115
+ "NeuronSimplifyPredicates": 0.042169809341430664,
116
+ "NeuronValueNumbering": 0.025714874267578125,
117
+ "OptimizeAliasedCopyChain": 0.0007548332214355469,
118
+ "OptimizeNKIKernels": 4.075549602508545,
119
+ "PAGLayoutOpt": 0.1111152172088623,
120
+ "PComputeCutting": 0.005707263946533203,
121
+ "PGLayoutTilingPipeline": 1.204958438873291,
122
+ "PGTiling": 0.4116194248199463,
123
+ "PadElimination": 0.0003600120544433594,
124
+ "ParAxesAnnotation": 0.050878286361694336,
125
+ "PartialLoopFusion": 0.0372469425201416,
126
+ "PartialSimdFusion": 0.021113157272338867,
127
+ "PenguinizeFunctions": 3.199999991920777e-05,
128
+ "PerfectLoopNest": 0.007718086242675781,
129
+ "PruneFunctions": 3.400000059627928e-05,
130
+ "RecognizeOpIdiom": 0.0058002471923828125,
131
+ "Recompute": 0.0017511844635009766,
132
+ "RelaxPredicates": 0.00795745849609375,
133
+ "Rematerialization": 0.0019276142120361328,
134
+ "RemoveOptimizationBarriers": 8.50000069476664e-05,
135
+ "RemoveShardedPartitionAxes": 0.008410930633544922,
136
+ "ReshapeWeights": 0.0063934326171875,
137
+ "ResolveAccessConflict": 0.01411294937133789,
138
+ "ResolveComplicatePredicates": 0.004876375198364258,
139
+ "RewriteReplicationMatmul": 0.0017600059509277344,
140
+ "RewriteWeights": 0.004542827606201172,
141
+ "SFKVectorizer": 0.3233633041381836,
142
+ "ScatterMotion": 5.7999997807201e-05,
143
+ "ShardingPropagationAnalysis": 0.06259655952453613,
144
+ "SimpleAllReduceTiling": 0.010744571685791016,
145
+ "Simplifier": 0.0033507347106933594,
146
+ "SimplifyMacroPredicates": 0.056143999099731445,
147
+ "SimplifyNeuronTensor": 0.1345655918121338,
148
+ "SimplifySlice": 0.001861572265625,
149
+ "SimplifyTensor": 0.02954578399658203,
150
+ "SpillPSum": 0.11643767356872559,
151
+ "SplitAPUnionSets": 0.07312703132629395,
152
+ "SplitAccGrp": 0.002663135528564453,
153
+ "StaticProfiler": 0.02257680892944336,
154
+ "StaticTransposeLocalTensor": 0.003572225570678711,
155
+ "SundaISel": 0.10315561294555664,
156
+ "TCTransform": 0.0025663375854492188,
157
+ "TensorInitialization": 0.00860285758972168,
158
+ "TensorOpSimplifier": 0.008630037307739258,
159
+ "TensorOpTransform": 0.028581619262695313,
160
+ "TensorizerLegalizationPass": 4.600000102072954e-05,
161
+ "TileCCOps": 0.00518488883972168,
162
+ "TilingProfiler": 0.023342609405517578,
163
+ "TransformConvOp": 0.008756637573242188,
164
+ "TritiumFusion": 0.13446974754333496,
165
+ "ValueNumbering": 0.003237485885620117,
166
+ "VectorizeDMA": 0.028183698654174805,
167
+ "VectorizeMatMult": 0.015199661254882813,
168
+ "VerifySupportedOps": 3.400000059627928e-05,
169
+ "WeightCoalescing": 0.01640915870666504,
170
+ "ZeroSizeTensorElimination": 0.0001671314239501953,
171
+ "algsimp": 0.0017099999822676182,
172
+ "batchnorm_expander": 3.400000059627928e-05,
173
+ "boundary-marker-removal": 1.2000000424450263e-05,
174
+ "call-inliner": 0.0002339999919058755,
175
+ "canonicalize-boundary-marker": 1.4999999621068127e-05,
176
+ "collective-stream-id-checker": 6.299999949987978e-05,
177
+ "comparison-expander": 0.0005050000036135316,
178
+ "computation-deduplicator": 5.100000271340832e-05,
179
+ "config-lowering": 0.0002690000110305846,
180
+ "constant-statistics": 0.000455000001238659,
181
+ "constant_folding": 0.00023099999816622585,
182
+ "cse": 3.7000001611886546e-05,
183
+ "dce": 6.000000212225132e-05,
184
+ "dot_decomposer": 0.0009510000236332417,
185
+ "dynamic-slice-transpose": 1.2999998943996616e-05,
186
+ "eliminate-redundant-compare": 0.00020500000391621143,
187
+ "emit-offloaded-dropout": 8.399999933317304e-05,
188
+ "flatten-call-graph": 0.0006050000083632767,
189
+ "fuse-send-recv": 5.199999577598646e-05,
190
+ "hilo-conditional-to-select": 1.4000000192027073e-05,
191
+ "hilo::LegalizeAlias": 1.2000000424450263e-05,
192
+ "hilo::NeuronInstCombine": 0.0001320000010309741,
193
+ "hilo::NeuronOpFusion": 9.099999442696571e-05,
194
+ "hilo::ReplaceTokenTypeWithU8Pass": 3.300000025774352e-05,
195
+ "hilo::ScheduleFusion": 5.999999757477781e-06,
196
+ "hilo::SixtyFourHack": 5.999999848427251e-05,
197
+ "hilo::VerifyAliasing": 3.999999989900971e-06,
198
+ "hlo-mac-count": 0.012813999317586422,
199
+ "instruction-histogram": 0.0005469999741762877,
200
+ "io-con-pipe-begin": 4.999999873689376e-06,
201
+ "io-con-pipe-end": 9.999999974752427e-07,
202
+ "io-layout-normalization": 0.0009079999872483313,
203
+ "io-statistics": 4.400000034365803e-05,
204
+ "legalize-ccops-for-tensorizer": 3.999999989900971e-06,
205
+ "legalize-compare": 1.1000000085914508e-05,
206
+ "lower-argminmax-custom-call": 9.999999747378752e-06,
207
+ "map-inline": 0.0007319999858736992,
208
+ "metadata-naming": 4.3000000005122274e-05,
209
+ "mlir::detail::OpToOpPassAdaptor": 7.100000220816582e-05,
210
+ "mlir::hlo::MhloToPyPenguin": 0.006075000390410423,
211
+ "mlir::mhlo::LowerComplexExtraPass": 0.0002460000105202198,
212
+ "mlir::mhlo::LowerComplexPass": 0.00047699996503069997,
213
+ "native-to-custom-softmax": 0.0005559999844990671,
214
+ "native-to-custom-softmax-dx": 0.0005599999567493796,
215
+ "neuron-hlo-verifier": 0.010796000249683857,
216
+ "operand_upcaster": 4.199999966658652e-05,
217
+ "opt-barrier-removal": 0.00039500001003034413,
218
+ "post-par-pipe-begin": 4.70000013592653e-05,
219
+ "post-par-pipe-end": 0.0,
220
+ "post-partition-simplification": 0.001361000002361834,
221
+ "pre-par-pipe-begin": 9.999999974752427e-07,
222
+ "pre-par-pipe-end": 0.0,
223
+ "pre-partition-simplification": 0.05799899995326996,
224
+ "replace-minimum-constant": 0.0003459999861661345,
225
+ "reshape-mover": 8.900000102585182e-05,
226
+ "simplify-concat": 0.00010900000052060932,
227
+ "simplify-while-loops": 5.900000178371556e-05,
228
+ "transform-variadic-reduce": 5.699999746866524e-05,
229
+ "tuple-simplifier": 0.00020900000527035445,
230
+ "unpack-nested-aws-ntwsr": 0.00026500000967644155,
231
+ "unroll-while-loop": 9.000000318337698e-06,
232
+ "zero_sized_hlo_elimination": 0.0007340000011026859
233
+ },
234
+ "hilo": {
235
+ "ConstantSize": 238229.0,
236
+ "HloInputCount": 371.0,
237
+ "HloMacCount": 6666190848.0,
238
+ "HloOutputCount": 57.0,
239
+ "IfmapSize": 3910913024.0,
240
+ "OfmapSize": 1879048192.0,
241
+ "OutputsReadFromCount": 0.0,
242
+ "PassthroughTensorsCount": 0.0,
243
+ "RedundantOutputCount": 0.0,
244
+ "Traffic": 864804480.0
245
+ },
246
+ "tensorizer": {
247
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 20773.0,
248
+ "StaticProfiler::AifUb": 131.73849487304688,
249
+ "StaticProfiler::ArithmeticIntensityTensorizer": 127.46285247802734,
250
+ "StaticProfiler::AverageDmaLength": 2400.2490234375,
251
+ "StaticProfiler::DDRTransferBytes": 361746464.0,
252
+ "StaticProfiler::InternalTransferBytes": 320526112.0,
253
+ "StaticProfiler::LoadExpanded": 84060.0,
254
+ "StaticProfiler::StoreExpanded": 1898.0,
255
+ "StaticProfiler::TotalDMAExpanded": 85958.0,
256
+ "StaticProfiler::TotalDynamicInstancesCount": 25131.0,
257
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 24680.0,
258
+ "StaticProfiler::TotalLNCComm": 0.0,
259
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
260
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
261
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
262
+ "TilingProfiler::GenericInstructionsAfterTiling": 4.0,
263
+ "TilingProfiler::MatMultInstructionsAfterTiling": 10368.0,
264
+ "TilingProfiler::NumPfTransposes": 6.0,
265
+ "TilingProfiler::NumPfTransposesForIo": 1.0,
266
+ "TilingProfiler::NumPfTransposesForLocal": 1.0,
267
+ "TilingProfiler::NumPfTransposesForNonlocal": 4.0,
268
+ "TilingProfiler::PfTransposeInstructions": 10147.0,
269
+ "TilingProfiler::PfTransposeInstructionsForIo": 9504.0,
270
+ "TilingProfiler::PfTransposeInstructionsForLocal": 1.0,
271
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 642.0,
272
+ "TilingProfiler::ReduceInstructionsAfterTiling": 4.0,
273
+ "TilingProfiler::SimdInstructionsAfterTiling": 92.0,
274
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
275
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
276
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
277
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
278
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
279
+ "TransformConvOp::conv2d_column_packing": 0.0,
280
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
281
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
282
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
283
+ }
284
+ },
285
+ "all": {
286
+ "compiletime": {
287
+ "algsimp": 0.001560000004246831,
288
+ "call-inliner": 0.0002099999983329326,
289
+ "collective-stream-id-checker": 5.6000000768108293e-05,
290
+ "comparison-expander": 0.0004900000058114529,
291
+ "constant-statistics": 0.000455000001238659,
292
+ "constant_folding": 0.00020900000527035445,
293
+ "dce": 5.700000110664405e-05,
294
+ "dot_decomposer": 0.0009510000236332417,
295
+ "eliminate-redundant-compare": 0.00019500000053085387,
296
+ "flatten-call-graph": 0.0005799999926239252,
297
+ "hlo-mac-count": 0.00829899962991476,
298
+ "instruction-histogram": 0.0005469999741762877,
299
+ "io-con-pipe-begin": 4.999999873689376e-06,
300
+ "io-con-pipe-end": 9.999999974752427e-07,
301
+ "io-layout-normalization": 0.0009079999872483313,
302
+ "io-statistics": 4.400000034365803e-05,
303
+ "map-inline": 0.0007019999902695417,
304
+ "native-to-custom-softmax": 0.0005370000144466758,
305
+ "native-to-custom-softmax-dx": 0.00047599998652003706,
306
+ "neuron-hlo-verifier": 0.009705999866127968,
307
+ "opt-barrier-removal": 0.00039500001003034413,
308
+ "pre-par-pipe-begin": 9.999999974752427e-07,
309
+ "pre-par-pipe-end": 0.0,
310
+ "pre-partition-simplification": 0.05799899995326996,
311
+ "replace-minimum-constant": 0.0003279999946244061,
312
+ "reshape-mover": 7.999999797903001e-05,
313
+ "simplify-while-loops": 5.2999999752501026e-05,
314
+ "tuple-simplifier": 0.00019700000120792538,
315
+ "unpack-nested-aws-ntwsr": 0.00025400001322850585,
316
+ "unroll-while-loop": 9.000000318337698e-06,
317
+ "zero_sized_hlo_elimination": 0.0007340000011026859
318
+ }
319
+ },
320
+ "cumsum": {
321
+ "compiletime": {
322
+ "CoalesceCCOp": 0.00032806396484375,
323
+ "DMALocalityOpt": 0.00027751922607421875,
324
+ "DMAProfiler": 0.0011353492736816406,
325
+ "DataStreaming": 0.00044035911560058594,
326
+ "DoNothing": 0.0001888275146484375,
327
+ "ExpandISAMacro": 0.003916263580322266,
328
+ "FactorizeBlkDims": 0.001834869384765625,
329
+ "InferPSumTensor": 0.0010616779327392578,
330
+ "InferSharedMemLoc": 0.00044918060302734375,
331
+ "InsertCoreBarrier": 0.0004329681396484375,
332
+ "LateLegalizeInst": 0.002650022506713867,
333
+ "LateNeuronInstComb": 0.002856016159057617,
334
+ "LegalizeSundaAccess": 0.002493619918823242,
335
+ "LegalizeType": 0.0004024505615234375,
336
+ "LowerBroadcast": 0.00041794776916503906,
337
+ "LowerIntrinsics": 0.0003495216369628906,
338
+ "LowerTranspose": 0.00037598609924316406,
339
+ "NeuronInstComb": 0.0011763572692871094,
340
+ "NeuronLICM": 0.0014426708221435547,
341
+ "NeuronSimplifyPredicates": 0.012172937393188477,
342
+ "NeuronValueNumbering": 0.0006816387176513672,
343
+ "SFKVectorizer": 0.011650562286376953,
344
+ "SimpleAllReduceTiling": 0.00033855438232421875,
345
+ "SimplifyNeuronTensor": 0.0009646415710449219,
346
+ "SpillPSum": 0.0025339126586914063,
347
+ "WeightCoalescing": 0.0003387928009033203
348
+ }
349
+ },
350
+ "sg00": {
351
+ "compiletime": {
352
+ "CanonicalizeConv": 7.000000096013537e-06,
353
+ "CanonicalizeForTensorizer": 1.5999999959603883e-05,
354
+ "Canonicalizer": 0.00033000000985339284,
355
+ "HoistCompute": 1.9999999949504854e-06,
356
+ "IdentifyCrossPassTensors": 1.4000000192027073e-05,
357
+ "MemcastMotion": 9.999999747378752e-06,
358
+ "PenguinizeFunctions": 1.4999999621068127e-05,
359
+ "PruneFunctions": 1.4999999621068127e-05,
360
+ "RemoveOptimizationBarriers": 3.300000025774352e-05,
361
+ "ScatterMotion": 2.2000000171829015e-05,
362
+ "TensorizerLegalizationPass": 2.8000000384054147e-05,
363
+ "VerifySupportedOps": 1.2000000424450263e-05,
364
+ "algsimp": 5.199999941396527e-05,
365
+ "batchnorm_expander": 1.1000000085914508e-05,
366
+ "boundary-marker-removal": 3.999999989900971e-06,
367
+ "call-inliner": 7.000000096013537e-06,
368
+ "canonicalize-boundary-marker": 4.999999873689376e-06,
369
+ "collective-stream-id-checker": 1.9999999949504854e-06,
370
+ "comparison-expander": 4.999999873689376e-06,
371
+ "computation-deduplicator": 1.5999999959603883e-05,
372
+ "config-lowering": 0.0001289999927394092,
373
+ "constant_folding": 7.000000096013537e-06,
374
+ "cse": 1.2000000424450263e-05,
375
+ "dce": 9.999999974752427e-07,
376
+ "dynamic-slice-transpose": 4.999999873689376e-06,
377
+ "eliminate-redundant-compare": 3.000000106112566e-06,
378
+ "emit-offloaded-dropout": 4.5000000682193786e-05,
379
+ "flatten-call-graph": 7.999999979801942e-06,
380
+ "fuse-send-recv": 1.8000000636675395e-05,
381
+ "hilo-conditional-to-select": 3.999999989900971e-06,
382
+ "hilo::LegalizeAlias": 6.000000212225132e-06,
383
+ "hilo::NeuronInstCombine": 6.70000008540228e-05,
384
+ "hilo::NeuronOpFusion": 4.099999932805076e-05,
385
+ "hilo::ReplaceTokenTypeWithU8Pass": 1.2999999853491317e-05,
386
+ "hilo::ScheduleFusion": 9.999999974752427e-07,
387
+ "hilo::SixtyFourHack": 1.2000000424450263e-05,
388
+ "hilo::VerifyAliasing": 1.9999999949504854e-06,
389
+ "hlo-mac-count": 1.8999999156221747e-05,
390
+ "legalize-ccops-for-tensorizer": 1.9999999949504854e-06,
391
+ "legalize-compare": 3.999999989900971e-06,
392
+ "lower-argminmax-custom-call": 3.000000106112566e-06,
393
+ "map-inline": 9.000000318337698e-06,
394
+ "metadata-naming": 1.2999999853491317e-05,
395
+ "mlir::detail::OpToOpPassAdaptor": 2.099999983329326e-05,
396
+ "mlir::hlo::MhloToPyPenguin": 0.0009730000165291131,
397
+ "mlir::mhlo::LowerComplexExtraPass": 8.399999933317304e-05,
398
+ "mlir::mhlo::LowerComplexPass": 0.000195999993593432,
399
+ "native-to-custom-softmax": 9.000000318337698e-06,
400
+ "native-to-custom-softmax-dx": 5.500000042957254e-05,
401
+ "neuron-hlo-verifier": 0.0003929999948013574,
402
+ "operand_upcaster": 1.700000029813964e-05,
403
+ "post-par-pipe-begin": 4.400000034365803e-05,
404
+ "post-par-pipe-end": 0.0,
405
+ "post-partition-simplification": 0.00047500000800937414,
406
+ "replace-minimum-constant": 6.000000212225132e-06,
407
+ "reshape-mover": 3.000000106112566e-06,
408
+ "simplify-concat": 3.400000059627928e-05,
409
+ "simplify-while-loops": 1.9999999949504854e-06,
410
+ "transform-variadic-reduce": 7.999999979801942e-06,
411
+ "tuple-simplifier": 3.999999989900971e-06,
412
+ "unpack-nested-aws-ntwsr": 3.999999989900971e-06,
413
+ "unroll-while-loop": 0.0
414
+ },
415
+ "hilo": {
416
+ "ArithmeticIntensity": 4.265669345855713,
417
+ "ConstantSize": 238229.0,
418
+ "HloInputCount": 371.0,
419
+ "HloMacCount": 838860800.0,
420
+ "HloOutputCount": 57.0,
421
+ "IfmapSize": 3910913024.0,
422
+ "OfmapSize": 1879048192.0,
423
+ "OutputsReadFromCount": 0.0,
424
+ "PassthroughTensorsCount": 0.0,
425
+ "RedundantOutputCount": 0.0,
426
+ "Traffic": 393307936.0
427
+ }
428
+ },
429
+ "sg0000": {
430
+ "compiletime": {
431
+ "AGOrderingAnalysisPass": 0.04803347587585449,
432
+ "AffinePredicateResolution": 0.0014185905456542969,
433
+ "AliasDependencyElimination": 0.0002288818359375,
434
+ "AliasDependencyInduction": 0.023572683334350586,
435
+ "AliasDependencyReset": 0.050307273864746094,
436
+ "BFComputeCutting": 0.0020284652709960938,
437
+ "BirCodeGenLoop": 0.06627583503723145,
438
+ "CCOpFusion": 0.030767440795898438,
439
+ "CanonicalizeDAGForPGTiling": 0.005156278610229492,
440
+ "CanonicalizeIR": 0.0024123191833496094,
441
+ "CoalesceCCOp": 0.017067909240722656,
442
+ "CommuteConcat": 0.0011420249938964844,
443
+ "DMALocalityOpt": 0.0021338462829589844,
444
+ "DMAProfiler": 0.015033483505249023,
445
+ "DMATilingProfiler": 0.006984710693359375,
446
+ "DataLocalityOpt": 0.3054013252258301,
447
+ "DataStreaming": 0.014647245407104492,
448
+ "DeConcat": 0.005982398986816406,
449
+ "DeadCodeElimination": 0.0018534660339355469,
450
+ "DeadStoreElimination": 0.04532670974731445,
451
+ "DelinearIndices": 0.028018474578857422,
452
+ "Delinearization": 0.0051403045654296875,
453
+ "DelinearizeSPMD": 0.03557705879211426,
454
+ "DoNothing": 0.00012373924255371094,
455
+ "DramToDramTranspose": 0.030788660049438477,
456
+ "DumpGraphAndMetadata": 0.008297920227050781,
457
+ "EliminateDivs": 0.003348112106323242,
458
+ "ExpandBatchNorm": 0.002971172332763672,
459
+ "ExpandISAMacro": 0.007505178451538086,
460
+ "FactorizeBlkDims": 0.052065372467041016,
461
+ "FactorizeThreadAxesInFreeDims": 0.006781101226806641,
462
+ "FlattenMacroLoop": 0.006749868392944336,
463
+ "GenericAccessSimplifier": 0.0015370845794677734,
464
+ "InferInitValue": 0.13031220436096191,
465
+ "InferIntrinsicOnCC": 0.01256871223449707,
466
+ "InferNeuronTensor": 0.07101988792419434,
467
+ "InferNonlocalTensors": 0.0933828353881836,
468
+ "InferPSumTensor": 0.09560966491699219,
469
+ "InferShardAxis": 0.312000036239624,
470
+ "InferSharedMemLoc": 0.006642341613769531,
471
+ "InlineNativeKernels": 0.0033979415893554688,
472
+ "InsertCoreBarrier": 0.008008241653442383,
473
+ "InsertIOTransposes": 0.018876314163208008,
474
+ "InsertImplicitShardAxisBeforeISel": 0.016681194305419922,
475
+ "InsertLocalTransposes": 0.009229898452758789,
476
+ "InsertOffloadedTransposes": 0.05370330810546875,
477
+ "LICM": 0.007573604583740234,
478
+ "LateLegalizeInst": 0.01623988151550293,
479
+ "LateLegalizePostSplit": 0.007147073745727539,
480
+ "LateLowerReshapeOp": 0.0011415481567382813,
481
+ "LateLowerTensorOp": 0.0066013336181640625,
482
+ "LateNeuronInstComb": 0.12343692779541016,
483
+ "LayoutPreprocessing": 0.02958393096923828,
484
+ "LayoutPreprocessingAndAnalysis": 0.14548635482788086,
485
+ "LayoutRequirementAnalysis": 0.007357358932495117,
486
+ "LegalizeCCOpLayout": 0.0018928050994873047,
487
+ "LegalizeOpLevelAlias": 0.001081228256225586,
488
+ "LegalizePartitionReduce": 0.003218412399291992,
489
+ "LegalizeSundaAccess": 0.08743572235107422,
490
+ "LegalizeSundaMacro": 0.04705023765563965,
491
+ "LegalizeType": 0.009063720703125,
492
+ "LocalLayoutOpt": 0.017424583435058594,
493
+ "LoopFusion": 0.006888866424560547,
494
+ "LoopSplitting": 0.0018482208251953125,
495
+ "LowerBroadcast": 0.00490117073059082,
496
+ "LowerCCOpBlockAxis": 0.004808902740478516,
497
+ "LowerComplexBroadcast": 0.007742166519165039,
498
+ "LowerIntrinsics": 0.04466986656188965,
499
+ "LowerShardAxis": 0.008558988571166992,
500
+ "LowerTensorOp": 0.011698722839355469,
501
+ "LowerToSendRecv": 0.01171255111694336,
502
+ "LowerTranspose": 0.012961864471435547,
503
+ "MacroGeneration": 0.07335543632507324,
504
+ "MaskPropagation": 0.004875659942626953,
505
+ "MemcpyElimination": 0.19086575508117676,
506
+ "MutateDataType": 0.002115011215209961,
507
+ "NeuronAliasDependencyInduction": 0.0007119178771972656,
508
+ "NeuronAliasDependencyReset": 0.0555264949798584,
509
+ "NeuronInstComb": 0.03685903549194336,
510
+ "NeuronLICM": 0.02129840850830078,
511
+ "NeuronLoopFusion": 0.04936552047729492,
512
+ "NeuronLoopInterchange": 0.008442163467407227,
513
+ "NeuronSimplifier": 0.020423412322998047,
514
+ "NeuronSimplifyPredicates": 0.013469934463500977,
515
+ "NeuronValueNumbering": 0.011552095413208008,
516
+ "OptimizeAliasedCopyChain": 0.0006189346313476563,
517
+ "OptimizeNKIKernels": 0.0030050277709960938,
518
+ "PAGLayoutOpt": 0.4311056137084961,
519
+ "PComputeCutting": 0.008741617202758789,
520
+ "PGLayoutTilingPipeline": 1.7890496253967285,
521
+ "PGTiling": 0.33126235008239746,
522
+ "PadElimination": 0.0006849765777587891,
523
+ "ParAxesAnnotation": 0.3421931266784668,
524
+ "PartialLoopFusion": 0.05652737617492676,
525
+ "PartialSimdFusion": 0.04400372505187988,
526
+ "PerfectLoopNest": 0.007196664810180664,
527
+ "RecognizeOpIdiom": 0.003924369812011719,
528
+ "Recompute": 0.0004436969757080078,
529
+ "RelaxPredicates": 0.006342649459838867,
530
+ "Rematerialization": 0.006484508514404297,
531
+ "RemoveShardedPartitionAxes": 0.03604388236999512,
532
+ "ReshapeWeights": 0.002611398696899414,
533
+ "ResolveAccessConflict": 0.01564621925354004,
534
+ "ResolveComplicatePredicates": 0.0013320446014404297,
535
+ "RewriteReplicationMatmul": 0.008888483047485352,
536
+ "RewriteWeights": 0.005518674850463867,
537
+ "SFKVectorizer": 0.23942208290100098,
538
+ "ShardingPropagationAnalysis": 0.06231117248535156,
539
+ "SimpleAllReduceTiling": 0.008965253829956055,
540
+ "Simplifier": 0.009177446365356445,
541
+ "SimplifyMacroPredicates": 0.03521132469177246,
542
+ "SimplifyNeuronTensor": 0.022907257080078125,
543
+ "SimplifySlice": 0.001043081283569336,
544
+ "SimplifyTensor": 0.028610706329345703,
545
+ "SpillPSum": 0.041993141174316406,
546
+ "SplitAPUnionSets": 0.06584334373474121,
547
+ "SplitAccGrp": 0.005825042724609375,
548
+ "StaticProfiler": 0.013434648513793945,
549
+ "StaticTransposeLocalTensor": 0.008102178573608398,
550
+ "SundaISel": 0.12313151359558105,
551
+ "TCTransform": 0.0010597705841064453,
552
+ "TensorInitialization": 0.024387359619140625,
553
+ "TensorOpSimplifier": 0.006582498550415039,
554
+ "TensorOpTransform": 0.06252408027648926,
555
+ "TileCCOps": 0.016498565673828125,
556
+ "TilingProfiler": 0.06818985939025879,
557
+ "TransformConvOp": 0.0028336048126220703,
558
+ "TritiumFusion": 0.01378488540649414,
559
+ "ValueNumbering": 0.0024378299713134766,
560
+ "VectorizeDMA": 0.042115211486816406,
561
+ "VectorizeMatMult": 0.008977413177490234,
562
+ "WeightCoalescing": 0.005861759185791016,
563
+ "ZeroSizeTensorElimination": 0.00017881393432617188
564
+ },
565
+ "tensorizer": {
566
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 416.0,
567
+ "StaticProfiler::AifUb": 5.140732288360596,
568
+ "StaticProfiler::ArithmeticIntensityTensorizer": 143.96510314941406,
569
+ "StaticProfiler::AverageDmaLength": 2013.53125,
570
+ "StaticProfiler::AverageFractalPeUtilization": 99.74824523925781,
571
+ "StaticProfiler::AveragePartitionUtilization": 99.1868667602539,
572
+ "StaticProfiler::AveragePeUtilization": 99.49378204345703,
573
+ "StaticProfiler::DDRTransferBytes": 16395014.0,
574
+ "StaticProfiler::InternalTransferBytes": 10682368.0,
575
+ "StaticProfiler::LoadExpanded": 3459.0,
576
+ "StaticProfiler::LocalizationEfficiency": 2800.478271484375,
577
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 3271.21728515625,
578
+ "StaticProfiler::StoreExpanded": 1537.0,
579
+ "StaticProfiler::TotalDMAExpanded": 4996.0,
580
+ "StaticProfiler::TotalDynamicInstancesCount": 801.0,
581
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 800.0,
582
+ "StaticProfiler::TotalLNCComm": 0.0,
583
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
584
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
585
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
586
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
587
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
588
+ "TilingProfiler::GenericInstructionsAfterTiling": 10.0,
589
+ "TilingProfiler::MatMultInstructionsAfterTiling": 253.0,
590
+ "TilingProfiler::NumPfTransposes": 7.0,
591
+ "TilingProfiler::NumPfTransposesForIo": 0.0,
592
+ "TilingProfiler::NumPfTransposesForLocal": 5.0,
593
+ "TilingProfiler::NumPfTransposesForNonlocal": 2.0,
594
+ "TilingProfiler::PfTransposeInstructions": 56.0,
595
+ "TilingProfiler::PfTransposeInstructionsForIo": 0.0,
596
+ "TilingProfiler::PfTransposeInstructionsForLocal": 32.0,
597
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 24.0,
598
+ "TilingProfiler::ReduceInstructionsAfterTiling": 0.0,
599
+ "TilingProfiler::SimdInstructionsAfterTiling": 47.0,
600
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
601
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
602
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
603
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
604
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
605
+ "TransformConvOp::conv2d_column_packing": 0.0,
606
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
607
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
608
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
609
+ }
610
+ },
611
+ "sg0001": {
612
+ "compiletime": {
613
+ "AGOrderingAnalysisPass": 0.19573044776916504,
614
+ "AffinePredicateResolution": 0.0011768341064453125,
615
+ "AliasDependencyElimination": 0.00014972686767578125,
616
+ "AliasDependencyInduction": 0.02374124526977539,
617
+ "AliasDependencyReset": 0.05898928642272949,
618
+ "BFComputeCutting": 0.0019648075103759766,
619
+ "BirCodeGenLoop": 0.04745078086853027,
620
+ "CCOpFusion": 0.034403324127197266,
621
+ "CanonicalizeDAGForPGTiling": 0.013227224349975586,
622
+ "CanonicalizeIR": 0.0016665458679199219,
623
+ "CoalesceCCOp": 0.008426904678344727,
624
+ "CommuteConcat": 0.0011937618255615234,
625
+ "DMALocalityOpt": 0.0020418167114257813,
626
+ "DMAProfiler": 0.0212709903717041,
627
+ "DMATilingProfiler": 0.007970333099365234,
628
+ "DataLocalityOpt": 0.31763386726379395,
629
+ "DataStreaming": 0.013140678405761719,
630
+ "DeConcat": 0.006093025207519531,
631
+ "DeadCodeElimination": 0.0022492408752441406,
632
+ "DeadStoreElimination": 0.03447914123535156,
633
+ "DelinearIndices": 0.017621278762817383,
634
+ "Delinearization": 0.006613731384277344,
635
+ "DelinearizeSPMD": 0.036255598068237305,
636
+ "DoNothing": 9.298324584960938e-05,
637
+ "DramToDramTranspose": 0.011357545852661133,
638
+ "DumpGraphAndMetadata": 0.0038836002349853516,
639
+ "EliminateDivs": 0.007913589477539063,
640
+ "ExpandBatchNorm": 0.0027163028717041016,
641
+ "ExpandISAMacro": 0.006444692611694336,
642
+ "FactorizeBlkDims": 0.023404359817504883,
643
+ "FactorizeThreadAxesInFreeDims": 0.011568069458007813,
644
+ "FlattenMacroLoop": 0.012357473373413086,
645
+ "GenericAccessSimplifier": 0.0020608901977539063,
646
+ "InferInitValue": 0.10583114624023438,
647
+ "InferIntrinsicOnCC": 0.00994729995727539,
648
+ "InferNeuronTensor": 0.04976606369018555,
649
+ "InferNonlocalTensors": 0.04819130897521973,
650
+ "InferPSumTensor": 0.0679934024810791,
651
+ "InferShardAxis": 0.6268763542175293,
652
+ "InferSharedMemLoc": 0.005129814147949219,
653
+ "InlineNativeKernels": 0.009308338165283203,
654
+ "InsertCoreBarrier": 0.00969243049621582,
655
+ "InsertIOTransposes": 0.03561210632324219,
656
+ "InsertImplicitShardAxisBeforeISel": 0.017783164978027344,
657
+ "InsertLocalTransposes": 0.012435436248779297,
658
+ "InsertOffloadedTransposes": 0.008218526840209961,
659
+ "LICM": 0.011756420135498047,
660
+ "LateLegalizeInst": 0.012684106826782227,
661
+ "LateLegalizePostSplit": 0.0054225921630859375,
662
+ "LateLowerReshapeOp": 0.002172231674194336,
663
+ "LateLowerTensorOp": 0.003939151763916016,
664
+ "LateNeuronInstComb": 0.07796549797058105,
665
+ "LayoutPreprocessing": 0.09417939186096191,
666
+ "LayoutPreprocessingAndAnalysis": 0.15397191047668457,
667
+ "LayoutRequirementAnalysis": 0.03167152404785156,
668
+ "LegalizeCCOpLayout": 0.001916646957397461,
669
+ "LegalizeOpLevelAlias": 0.00103759765625,
670
+ "LegalizePartitionReduce": 0.002568960189819336,
671
+ "LegalizeSundaAccess": 0.03490424156188965,
672
+ "LegalizeSundaMacro": 0.04486250877380371,
673
+ "LegalizeType": 0.010438203811645508,
674
+ "LocalLayoutOpt": 0.037950992584228516,
675
+ "LoopFusion": 0.00687098503112793,
676
+ "LoopSplitting": 0.002494335174560547,
677
+ "LowerBroadcast": 0.0028448104858398438,
678
+ "LowerCCOpBlockAxis": 0.016790151596069336,
679
+ "LowerComplexBroadcast": 0.003789663314819336,
680
+ "LowerIntrinsics": 0.06158947944641113,
681
+ "LowerShardAxis": 0.009115934371948242,
682
+ "LowerTensorOp": 0.011396646499633789,
683
+ "LowerToSendRecv": 0.00603795051574707,
684
+ "LowerTranspose": 0.030293703079223633,
685
+ "MacroGeneration": 0.14122748374938965,
686
+ "MaskPropagation": 0.007950544357299805,
687
+ "MemcpyElimination": 0.18889641761779785,
688
+ "MutateDataType": 0.0014033317565917969,
689
+ "NeuronAliasDependencyInduction": 0.0007326602935791016,
690
+ "NeuronAliasDependencyReset": 0.025636672973632813,
691
+ "NeuronInstComb": 0.0452880859375,
692
+ "NeuronLICM": 0.027920246124267578,
693
+ "NeuronLoopFusion": 0.07481861114501953,
694
+ "NeuronLoopInterchange": 0.004810810089111328,
695
+ "NeuronSimplifier": 0.027257442474365234,
696
+ "NeuronSimplifyPredicates": 0.011795282363891602,
697
+ "NeuronValueNumbering": 0.013232946395874023,
698
+ "OptimizeAliasedCopyChain": 0.000640869140625,
699
+ "OptimizeNKIKernels": 0.007096529006958008,
700
+ "PAGLayoutOpt": 0.25133657455444336,
701
+ "PComputeCutting": 0.02008199691772461,
702
+ "PGLayoutTilingPipeline": 2.1073567867279053,
703
+ "PGTiling": 0.5283112525939941,
704
+ "PadElimination": 0.0005664825439453125,
705
+ "ParAxesAnnotation": 0.16274571418762207,
706
+ "PartialLoopFusion": 0.07154703140258789,
707
+ "PartialSimdFusion": 0.05425691604614258,
708
+ "PerfectLoopNest": 0.007505655288696289,
709
+ "RecognizeOpIdiom": 0.004193305969238281,
710
+ "Recompute": 0.0005002021789550781,
711
+ "RelaxPredicates": 0.0031478404998779297,
712
+ "Rematerialization": 0.002758502960205078,
713
+ "RemoveShardedPartitionAxes": 0.05587267875671387,
714
+ "ReshapeWeights": 0.0015969276428222656,
715
+ "ResolveAccessConflict": 0.021365642547607422,
716
+ "ResolveComplicatePredicates": 0.0011401176452636719,
717
+ "RewriteReplicationMatmul": 0.0025501251220703125,
718
+ "RewriteWeights": 0.014093399047851563,
719
+ "SFKVectorizer": 0.51774001121521,
720
+ "ShardingPropagationAnalysis": 0.030755996704101563,
721
+ "SimpleAllReduceTiling": 0.003780364990234375,
722
+ "Simplifier": 0.006270885467529297,
723
+ "SimplifyMacroPredicates": 0.01894402503967285,
724
+ "SimplifyNeuronTensor": 0.036655426025390625,
725
+ "SimplifySlice": 0.0019352436065673828,
726
+ "SimplifyTensor": 0.033560752868652344,
727
+ "SpillPSum": 0.03554582595825195,
728
+ "SplitAPUnionSets": 0.039057016372680664,
729
+ "SplitAccGrp": 0.002908468246459961,
730
+ "StaticProfiler": 0.009857654571533203,
731
+ "StaticTransposeLocalTensor": 0.014261007308959961,
732
+ "SundaISel": 0.07885026931762695,
733
+ "TCTransform": 0.0012857913970947266,
734
+ "TensorInitialization": 0.011929512023925781,
735
+ "TensorOpSimplifier": 0.007134199142456055,
736
+ "TensorOpTransform": 0.05220603942871094,
737
+ "TileCCOps": 0.006574392318725586,
738
+ "TilingProfiler": 0.03860926628112793,
739
+ "TransformConvOp": 0.002733469009399414,
740
+ "TritiumFusion": 0.08646178245544434,
741
+ "ValueNumbering": 0.003155946731567383,
742
+ "VectorizeDMA": 0.029859304428100586,
743
+ "VectorizeMatMult": 0.011672019958496094,
744
+ "WeightCoalescing": 0.004624366760253906,
745
+ "ZeroSizeTensorElimination": 0.0002124309539794922
746
+ },
747
+ "tensorizer": {
748
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 1427.0,
749
+ "StaticProfiler::AifUb": 40.19935607910156,
750
+ "StaticProfiler::ArithmeticIntensityTensorizer": 134.3648223876953,
751
+ "StaticProfiler::AverageDmaLength": 4238.58251953125,
752
+ "StaticProfiler::AverageFractalPeUtilization": 100.0,
753
+ "StaticProfiler::AveragePartitionUtilization": 99.61003112792969,
754
+ "StaticProfiler::AveragePeUtilization": 100.0,
755
+ "StaticProfiler::DDRTransferBytes": 55879176.0,
756
+ "StaticProfiler::InternalTransferBytes": 9895936.0,
757
+ "StaticProfiler::LoadExpanded": 9729.0,
758
+ "StaticProfiler::LocalizationEfficiency": 334.2462158203125,
759
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 357.74188232421875,
760
+ "StaticProfiler::StoreExpanded": 769.0,
761
+ "StaticProfiler::TotalDMAExpanded": 10498.0,
762
+ "StaticProfiler::TotalDynamicInstancesCount": 1799.0,
763
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 1799.0,
764
+ "StaticProfiler::TotalLNCComm": 0.0,
765
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
766
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
767
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
768
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
769
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
770
+ "TilingProfiler::GenericInstructionsAfterTiling": 8.0,
771
+ "TilingProfiler::MatMultInstructionsAfterTiling": 1116.0,
772
+ "TilingProfiler::NumPfTransposes": 8.0,
773
+ "TilingProfiler::NumPfTransposesForIo": 3.0,
774
+ "TilingProfiler::NumPfTransposesForLocal": 3.0,
775
+ "TilingProfiler::NumPfTransposesForNonlocal": 2.0,
776
+ "TilingProfiler::PfTransposeInstructions": 66.0,
777
+ "TilingProfiler::PfTransposeInstructionsForIo": 18.0,
778
+ "TilingProfiler::PfTransposeInstructionsForLocal": 16.0,
779
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 32.0,
780
+ "TilingProfiler::ReduceInstructionsAfterTiling": 0.0,
781
+ "TilingProfiler::SimdInstructionsAfterTiling": 87.0,
782
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
783
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
784
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
785
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
786
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
787
+ "TransformConvOp::conv2d_column_packing": 0.0,
788
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
789
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
790
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
791
+ }
792
+ },
793
+ "sg0002": {
794
+ "compiletime": {
795
+ "AGOrderingAnalysisPass": 0.037471771240234375,
796
+ "AffinePredicateResolution": 0.0048100948333740234,
797
+ "AliasDependencyElimination": 0.0002529621124267578,
798
+ "AliasDependencyInduction": 0.005568504333496094,
799
+ "AliasDependencyReset": 0.11161017417907715,
800
+ "BFComputeCutting": 0.0024290084838867188,
801
+ "BirCodeGenLoop": 0.32352304458618164,
802
+ "CCOpFusion": 0.033486366271972656,
803
+ "CanonicalizeDAGForPGTiling": 0.004197120666503906,
804
+ "CanonicalizeIR": 0.0025298595428466797,
805
+ "CoalesceCCOp": 0.007080078125,
806
+ "CommuteConcat": 0.0018744468688964844,
807
+ "DMALocalityOpt": 0.0021386146545410156,
808
+ "DMAProfiler": 0.01854729652404785,
809
+ "DMATilingProfiler": 0.015254497528076172,
810
+ "DataLocalityOpt": 0.1120154857635498,
811
+ "DataStreaming": 0.007681369781494141,
812
+ "DeConcat": 0.0022406578063964844,
813
+ "DeadCodeElimination": 0.0021486282348632813,
814
+ "DeadStoreElimination": 0.0063364505767822266,
815
+ "DelinearIndices": 0.0064697265625,
816
+ "Delinearization": 0.004486560821533203,
817
+ "DelinearizeSPMD": 0.01732611656188965,
818
+ "DoNothing": 9.441375732421875e-05,
819
+ "DramToDramTranspose": 0.02082037925720215,
820
+ "DumpGraphAndMetadata": 0.036411285400390625,
821
+ "EliminateDivs": 0.01006174087524414,
822
+ "ExpandBatchNorm": 0.0024886131286621094,
823
+ "ExpandISAMacro": 0.007379293441772461,
824
+ "FactorizeBlkDims": 0.023633480072021484,
825
+ "FactorizeThreadAxesInFreeDims": 0.0071103572845458984,
826
+ "FlattenMacroLoop": 0.009794235229492188,
827
+ "GenericAccessSimplifier": 0.0009224414825439453,
828
+ "InferInitValue": 0.12128233909606934,
829
+ "InferIntrinsicOnCC": 0.01005697250366211,
830
+ "InferNeuronTensor": 0.029047489166259766,
831
+ "InferNonlocalTensors": 0.017493009567260742,
832
+ "InferPSumTensor": 0.04303455352783203,
833
+ "InferShardAxis": 0.26027798652648926,
834
+ "InferSharedMemLoc": 0.012881040573120117,
835
+ "InlineNativeKernels": 0.002816915512084961,
836
+ "InsertCoreBarrier": 0.009889602661132813,
837
+ "InsertIOTransposes": 0.019797325134277344,
838
+ "InsertImplicitShardAxisBeforeISel": 0.05061173439025879,
839
+ "InsertLocalTransposes": 0.004299163818359375,
840
+ "InsertOffloadedTransposes": 0.008011579513549805,
841
+ "LICM": 0.009003639221191406,
842
+ "LateLegalizeInst": 0.013794183731079102,
843
+ "LateLegalizePostSplit": 0.013758182525634766,
844
+ "LateLowerReshapeOp": 0.0012693405151367188,
845
+ "LateLowerTensorOp": 0.002027750015258789,
846
+ "LateNeuronInstComb": 0.09844541549682617,
847
+ "LayoutPreprocessing": 0.025156497955322266,
848
+ "LayoutPreprocessingAndAnalysis": 0.06950831413269043,
849
+ "LayoutRequirementAnalysis": 0.0069408416748046875,
850
+ "LegalizeCCOpLayout": 0.003494739532470703,
851
+ "LegalizeOpLevelAlias": 0.0016810894012451172,
852
+ "LegalizePartitionReduce": 0.0026693344116210938,
853
+ "LegalizeSundaAccess": 0.0380399227142334,
854
+ "LegalizeSundaMacro": 0.10486245155334473,
855
+ "LegalizeType": 0.015400409698486328,
856
+ "LocalLayoutOpt": 0.012215137481689453,
857
+ "LoopFusion": 0.0049479007720947266,
858
+ "LoopSplitting": 0.0008144378662109375,
859
+ "LowerBroadcast": 0.0033435821533203125,
860
+ "LowerCCOpBlockAxis": 0.0037145614624023438,
861
+ "LowerComplexBroadcast": 0.0070230960845947266,
862
+ "LowerIntrinsics": 0.08174729347229004,
863
+ "LowerShardAxis": 0.020240068435668945,
864
+ "LowerTensorOp": 0.028459787368774414,
865
+ "LowerToSendRecv": 0.02129983901977539,
866
+ "LowerTranspose": 0.05583548545837402,
867
+ "MacroGeneration": 0.03631877899169922,
868
+ "MaskPropagation": 0.004620075225830078,
869
+ "MemcpyElimination": 0.04741477966308594,
870
+ "MutateDataType": 0.002264261245727539,
871
+ "NeuronAliasDependencyInduction": 0.002180337905883789,
872
+ "NeuronAliasDependencyReset": 0.08514618873596191,
873
+ "NeuronInstComb": 0.017351865768432617,
874
+ "NeuronLICM": 0.015241861343383789,
875
+ "NeuronLoopFusion": 0.05364656448364258,
876
+ "NeuronLoopInterchange": 0.002526521682739258,
877
+ "NeuronSimplifier": 0.06896662712097168,
878
+ "NeuronSimplifyPredicates": 0.023428916931152344,
879
+ "NeuronValueNumbering": 0.009569168090820313,
880
+ "OptimizeAliasedCopyChain": 0.0007548332214355469,
881
+ "OptimizeNKIKernels": 4.075549602508545,
882
+ "PAGLayoutOpt": 0.1111152172088623,
883
+ "PComputeCutting": 0.005707263946533203,
884
+ "PGLayoutTilingPipeline": 1.204958438873291,
885
+ "PGTiling": 0.4116194248199463,
886
+ "PadElimination": 0.0003600120544433594,
887
+ "ParAxesAnnotation": 0.050878286361694336,
888
+ "PartialLoopFusion": 0.0372469425201416,
889
+ "PartialSimdFusion": 0.021113157272338867,
890
+ "PerfectLoopNest": 0.007718086242675781,
891
+ "RecognizeOpIdiom": 0.0058002471923828125,
892
+ "Recompute": 0.0017511844635009766,
893
+ "RelaxPredicates": 0.00795745849609375,
894
+ "Rematerialization": 0.0019276142120361328,
895
+ "RemoveShardedPartitionAxes": 0.008410930633544922,
896
+ "ReshapeWeights": 0.0063934326171875,
897
+ "ResolveAccessConflict": 0.01411294937133789,
898
+ "ResolveComplicatePredicates": 0.004876375198364258,
899
+ "RewriteReplicationMatmul": 0.0017600059509277344,
900
+ "RewriteWeights": 0.004542827606201172,
901
+ "SFKVectorizer": 0.23946118354797363,
902
+ "ShardingPropagationAnalysis": 0.06259655952453613,
903
+ "SimpleAllReduceTiling": 0.004370212554931641,
904
+ "Simplifier": 0.0033507347106933594,
905
+ "SimplifyMacroPredicates": 0.056143999099731445,
906
+ "SimplifyNeuronTensor": 0.020067691802978516,
907
+ "SimplifySlice": 0.001861572265625,
908
+ "SimplifyTensor": 0.02954578399658203,
909
+ "SpillPSum": 0.03782367706298828,
910
+ "SplitAPUnionSets": 0.07312703132629395,
911
+ "SplitAccGrp": 0.002663135528564453,
912
+ "StaticProfiler": 0.02257680892944336,
913
+ "StaticTransposeLocalTensor": 0.003572225570678711,
914
+ "SundaISel": 0.10315561294555664,
915
+ "TCTransform": 0.0025663375854492188,
916
+ "TensorInitialization": 0.00860285758972168,
917
+ "TensorOpSimplifier": 0.008630037307739258,
918
+ "TensorOpTransform": 0.028581619262695313,
919
+ "TileCCOps": 0.00518488883972168,
920
+ "TilingProfiler": 0.023342609405517578,
921
+ "TransformConvOp": 0.008756637573242188,
922
+ "TritiumFusion": 0.13446974754333496,
923
+ "ValueNumbering": 0.003237485885620117,
924
+ "VectorizeDMA": 0.028183698654174805,
925
+ "VectorizeMatMult": 0.015199661254882813,
926
+ "WeightCoalescing": 0.0020062923431396484,
927
+ "ZeroSizeTensorElimination": 0.0001671314239501953
928
+ },
929
+ "tensorizer": {
930
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 20773.0,
931
+ "StaticProfiler::AifUb": 131.73849487304688,
932
+ "StaticProfiler::ArithmeticIntensityTensorizer": 127.46285247802734,
933
+ "StaticProfiler::AverageDmaLength": 2400.2490234375,
934
+ "StaticProfiler::AverageFractalPeUtilization": 98.70232391357422,
935
+ "StaticProfiler::AveragePartitionUtilization": 94.02606201171875,
936
+ "StaticProfiler::AveragePeUtilization": 96.57791900634766,
937
+ "StaticProfiler::DDRTransferBytes": 361746464.0,
938
+ "StaticProfiler::InternalTransferBytes": 320526112.0,
939
+ "StaticProfiler::LoadExpanded": 84060.0,
940
+ "StaticProfiler::LocalizationEfficiency": 96.75444030761719,
941
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 99.23246002197266,
942
+ "StaticProfiler::StoreExpanded": 1898.0,
943
+ "StaticProfiler::TotalDMAExpanded": 85958.0,
944
+ "StaticProfiler::TotalDynamicInstancesCount": 25131.0,
945
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 24680.0,
946
+ "StaticProfiler::TotalLNCComm": 0.0,
947
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
948
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
949
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
950
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
951
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
952
+ "TilingProfiler::GenericInstructionsAfterTiling": 4.0,
953
+ "TilingProfiler::MatMultInstructionsAfterTiling": 10368.0,
954
+ "TilingProfiler::NumPfTransposes": 6.0,
955
+ "TilingProfiler::NumPfTransposesForIo": 1.0,
956
+ "TilingProfiler::NumPfTransposesForLocal": 1.0,
957
+ "TilingProfiler::NumPfTransposesForNonlocal": 4.0,
958
+ "TilingProfiler::PfTransposeInstructions": 10147.0,
959
+ "TilingProfiler::PfTransposeInstructionsForIo": 9504.0,
960
+ "TilingProfiler::PfTransposeInstructionsForLocal": 1.0,
961
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 642.0,
962
+ "TilingProfiler::ReduceInstructionsAfterTiling": 4.0,
963
+ "TilingProfiler::SimdInstructionsAfterTiling": 92.0,
964
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
965
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
966
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
967
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
968
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
969
+ "TransformConvOp::conv2d_column_packing": 0.0,
970
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
971
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
972
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
973
+ }
974
+ },
975
+ "sg01": {
976
+ "compiletime": {
977
+ "CanonicalizeConv": 1.8000000636675395e-05,
978
+ "CanonicalizeForTensorizer": 1.1000000085914508e-05,
979
+ "Canonicalizer": 0.0002209999947808683,
980
+ "HoistCompute": 3.999999989900971e-06,
981
+ "IdentifyCrossPassTensors": 1.2000000424450263e-05,
982
+ "MemcastMotion": 7.999999979801942e-06,
983
+ "PenguinizeFunctions": 9.000000318337698e-06,
984
+ "PruneFunctions": 1.2000000424450263e-05,
985
+ "RemoveOptimizationBarriers": 2.9000000722589903e-05,
986
+ "ScatterMotion": 3.099999958067201e-05,
987
+ "TensorizerLegalizationPass": 1.2000000424450263e-05,
988
+ "VerifySupportedOps": 9.999999747378752e-06,
989
+ "algsimp": 4.999999873689376e-05,
990
+ "batchnorm_expander": 1.2000000424450263e-05,
991
+ "boundary-marker-removal": 3.999999989900971e-06,
992
+ "call-inliner": 7.999999979801942e-06,
993
+ "canonicalize-boundary-marker": 4.999999873689376e-06,
994
+ "collective-stream-id-checker": 3.000000106112566e-06,
995
+ "comparison-expander": 4.999999873689376e-06,
996
+ "computation-deduplicator": 1.700000029813964e-05,
997
+ "config-lowering": 0.00010399999882793054,
998
+ "constant_folding": 7.999999979801942e-06,
999
+ "cse": 1.1000000085914508e-05,
1000
+ "dce": 9.999999974752427e-07,
1001
+ "dynamic-slice-transpose": 3.999999989900971e-06,
1002
+ "eliminate-redundant-compare": 3.999999989900971e-06,
1003
+ "emit-offloaded-dropout": 2.499999936844688e-05,
1004
+ "flatten-call-graph": 7.000000096013537e-06,
1005
+ "fuse-send-recv": 1.8999999156221747e-05,
1006
+ "hilo-conditional-to-select": 3.999999989900971e-06,
1007
+ "hilo::LegalizeAlias": 3.999999989900971e-06,
1008
+ "hilo::NeuronInstCombine": 5.2999999752501026e-05,
1009
+ "hilo::NeuronOpFusion": 3.899999865097925e-05,
1010
+ "hilo::ReplaceTokenTypeWithU8Pass": 7.000000096013537e-06,
1011
+ "hilo::ScheduleFusion": 9.999999974752427e-07,
1012
+ "hilo::SixtyFourHack": 9.000000318337698e-06,
1013
+ "hilo::VerifyAliasing": 9.999999974752427e-07,
1014
+ "hlo-mac-count": 1.8999999156221747e-05,
1015
+ "legalize-ccops-for-tensorizer": 9.999999974752427e-07,
1016
+ "legalize-compare": 3.999999989900971e-06,
1017
+ "lower-argminmax-custom-call": 3.000000106112566e-06,
1018
+ "map-inline": 9.999999747378752e-06,
1019
+ "metadata-naming": 1.700000029813964e-05,
1020
+ "mlir::detail::OpToOpPassAdaptor": 1.9999999494757503e-05,
1021
+ "mlir::hlo::MhloToPyPenguin": 0.0009130000253207982,
1022
+ "mlir::mhlo::LowerComplexExtraPass": 7.200000254670158e-05,
1023
+ "mlir::mhlo::LowerComplexPass": 0.0001250000059371814,
1024
+ "native-to-custom-softmax": 4.999999873689376e-06,
1025
+ "native-to-custom-softmax-dx": 1.1000000085914508e-05,
1026
+ "neuron-hlo-verifier": 0.00036299999919719994,
1027
+ "operand_upcaster": 1.4000000192027073e-05,
1028
+ "post-par-pipe-begin": 1.9999999949504854e-06,
1029
+ "post-par-pipe-end": 0.0,
1030
+ "post-partition-simplification": 0.0004330000083427876,
1031
+ "replace-minimum-constant": 4.999999873689376e-06,
1032
+ "reshape-mover": 3.000000106112566e-06,
1033
+ "simplify-concat": 3.7000001611886546e-05,
1034
+ "simplify-while-loops": 1.9999999949504854e-06,
1035
+ "transform-variadic-reduce": 7.000000096013537e-06,
1036
+ "tuple-simplifier": 3.999999989900971e-06,
1037
+ "unpack-nested-aws-ntwsr": 3.000000106112566e-06,
1038
+ "unroll-while-loop": 0.0
1039
+ },
1040
+ "hilo": {
1041
+ "ArithmeticIntensity": 53.940223693847656,
1042
+ "HloMacCount": 3254779904.0,
1043
+ "Traffic": 120680992.0
1044
+ }
1045
+ },
1046
+ "sg02": {
1047
+ "compiletime": {
1048
+ "CanonicalizeConv": 1.9999999949504854e-06,
1049
+ "CanonicalizeForTensorizer": 1.2000000424450263e-05,
1050
+ "Canonicalizer": 0.0003380000125616789,
1051
+ "HoistCompute": 9.999999974752427e-07,
1052
+ "IdentifyCrossPassTensors": 9.999999747378752e-06,
1053
+ "MemcastMotion": 1.1000000085914508e-05,
1054
+ "PenguinizeFunctions": 7.999999979801942e-06,
1055
+ "PruneFunctions": 7.000000096013537e-06,
1056
+ "RemoveOptimizationBarriers": 2.300000051036477e-05,
1057
+ "ScatterMotion": 4.999999873689376e-06,
1058
+ "TensorizerLegalizationPass": 6.000000212225132e-06,
1059
+ "VerifySupportedOps": 1.2000000424450263e-05,
1060
+ "algsimp": 4.8000001697801054e-05,
1061
+ "batchnorm_expander": 1.1000000085914508e-05,
1062
+ "boundary-marker-removal": 3.999999989900971e-06,
1063
+ "call-inliner": 9.000000318337698e-06,
1064
+ "canonicalize-boundary-marker": 4.999999873689376e-06,
1065
+ "collective-stream-id-checker": 1.9999999949504854e-06,
1066
+ "comparison-expander": 4.999999873689376e-06,
1067
+ "computation-deduplicator": 1.8000000636675395e-05,
1068
+ "config-lowering": 3.600000127335079e-05,
1069
+ "constant_folding": 7.000000096013537e-06,
1070
+ "cse": 1.4000000192027073e-05,
1071
+ "dce": 9.999999974752427e-07,
1072
+ "dynamic-slice-transpose": 3.999999989900971e-06,
1073
+ "eliminate-redundant-compare": 3.000000106112566e-06,
1074
+ "emit-offloaded-dropout": 1.4000000192027073e-05,
1075
+ "flatten-call-graph": 9.999999747378752e-06,
1076
+ "fuse-send-recv": 1.4999999621068127e-05,
1077
+ "hilo-conditional-to-select": 6.000000212225132e-06,
1078
+ "hilo::LegalizeAlias": 1.9999999949504854e-06,
1079
+ "hilo::NeuronInstCombine": 1.2000000424450263e-05,
1080
+ "hilo::NeuronOpFusion": 1.1000000085914508e-05,
1081
+ "hilo::ReplaceTokenTypeWithU8Pass": 1.2999999853491317e-05,
1082
+ "hilo::ScheduleFusion": 3.999999989900971e-06,
1083
+ "hilo::SixtyFourHack": 3.899999865097925e-05,
1084
+ "hilo::VerifyAliasing": 9.999999974752427e-07,
1085
+ "hlo-mac-count": 0.004476999863982201,
1086
+ "legalize-ccops-for-tensorizer": 9.999999974752427e-07,
1087
+ "legalize-compare": 3.000000106112566e-06,
1088
+ "lower-argminmax-custom-call": 3.999999989900971e-06,
1089
+ "map-inline": 1.1000000085914508e-05,
1090
+ "metadata-naming": 1.2999999853491317e-05,
1091
+ "mlir::detail::OpToOpPassAdaptor": 2.9999999242136255e-05,
1092
+ "mlir::hlo::MhloToPyPenguin": 0.004188999999314547,
1093
+ "mlir::mhlo::LowerComplexExtraPass": 9.000000136438757e-05,
1094
+ "mlir::mhlo::LowerComplexPass": 0.000155999994603917,
1095
+ "native-to-custom-softmax": 4.999999873689376e-06,
1096
+ "native-to-custom-softmax-dx": 1.8000000636675395e-05,
1097
+ "neuron-hlo-verifier": 0.00033400001120753586,
1098
+ "operand_upcaster": 1.1000000085914508e-05,
1099
+ "post-par-pipe-begin": 9.999999974752427e-07,
1100
+ "post-par-pipe-end": 0.0,
1101
+ "post-partition-simplification": 0.0004529999860096723,
1102
+ "replace-minimum-constant": 7.000000096013537e-06,
1103
+ "reshape-mover": 3.000000106112566e-06,
1104
+ "simplify-concat": 3.7999998312443495e-05,
1105
+ "simplify-while-loops": 1.9999999949504854e-06,
1106
+ "transform-variadic-reduce": 4.199999966658652e-05,
1107
+ "tuple-simplifier": 3.999999989900971e-06,
1108
+ "unpack-nested-aws-ntwsr": 3.999999989900971e-06,
1109
+ "unroll-while-loop": 0.0
1110
+ },
1111
+ "hilo": {
1112
+ "ArithmeticIntensity": 14.666111946105957,
1113
+ "HloMacCount": 2572550144.0,
1114
+ "Traffic": 350815552.0
1115
+ }
1116
+ },
1117
+ "topk": {
1118
+ "compiletime": {
1119
+ "CoalesceCCOp": 0.006727457046508789,
1120
+ "DMALocalityOpt": 0.009476661682128906,
1121
+ "DMAProfiler": 0.006308317184448242,
1122
+ "DataStreaming": 0.029163122177124023,
1123
+ "DoNothing": 0.0004937648773193359,
1124
+ "ExpandISAMacro": 0.006926536560058594,
1125
+ "FactorizeBlkDims": 0.049018144607543945,
1126
+ "InferPSumTensor": 0.049260616302490234,
1127
+ "InferSharedMemLoc": 0.003329038619995117,
1128
+ "InsertCoreBarrier": 0.0059740543365478516,
1129
+ "LateLegalizeInst": 0.019405364990234375,
1130
+ "LateNeuronInstComb": 0.04540205001831055,
1131
+ "LegalizeSundaAccess": 0.046309709548950195,
1132
+ "LegalizeType": 0.05346846580505371,
1133
+ "LowerBroadcast": 0.015480279922485352,
1134
+ "LowerIntrinsics": 0.007883310317993164,
1135
+ "LowerTranspose": 0.010731220245361328,
1136
+ "NeuronInstComb": 0.03727889060974121,
1137
+ "NeuronLICM": 0.03041553497314453,
1138
+ "NeuronSimplifyPredicates": 0.006567955017089844,
1139
+ "NeuronValueNumbering": 0.015464067459106445,
1140
+ "SFKVectorizer": 0.07225155830383301,
1141
+ "SimpleAllReduceTiling": 0.006035804748535156,
1142
+ "SimplifyNeuronTensor": 0.11353325843811035,
1143
+ "SpillPSum": 0.0760800838470459,
1144
+ "WeightCoalescing": 0.01406407356262207
1145
+ }
1146
+ }
1147
+ }
context_encoding_model/_tp0_bk0/graph.neff ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98834cf4cd3214e9f9fc84530eed5ef31b01fda5919c60b959ca4a30bcb80d0c
3
+ size 1188864
context_encoding_model/_tp0_bk0/log-neuron-cc.txt ADDED
The diff for this file is too large to render. See raw diff
 
context_encoding_model/_tp0_bk0/metaneff.pb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec75ee80b2ec3909e8e315fa6044902ec93fdb3a62229b909f551426d04c56b6
3
+ size 2077993
context_encoding_model/_tp0_bk0/model.MODULE_e7e1b6c43bb87ca73ecc+2ee9f01d.hlo_module.pb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b77f309407f7c741dd9b51614fc850fa657ce4e6ca40a18b4471f2b477760976
3
+ size 2163092
context_encoding_model/_tp0_bk0/model.MODULE_e7e1b6c43bb87ca73ecc+2ee9f01d.neff ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98834cf4cd3214e9f9fc84530eed5ef31b01fda5919c60b959ca4a30bcb80d0c
3
+ size 1188864
context_encoding_model/_tp0_bk0/neuron_config.json ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation_autoset": false,
3
+ "_name_or_path": "/home/ubuntu/models/Qwen3-1.7B",
4
+ "add_cross_attention": false,
5
+ "architectures": [
6
+ "Qwen3ForCausalLM"
7
+ ],
8
+ "attention_bias": false,
9
+ "attention_dropout": 0.0,
10
+ "attribute_map": {},
11
+ "bad_words_ids": null,
12
+ "begin_suppress_tokens": null,
13
+ "bos_token_id": 151643,
14
+ "chunk_size_feed_forward": 0,
15
+ "cross_attention_hidden_size": null,
16
+ "decoder_start_token_id": null,
17
+ "diversity_penalty": 0.0,
18
+ "do_sample": false,
19
+ "early_stopping": false,
20
+ "encoder_no_repeat_ngram_size": 0,
21
+ "eos_token_id": 151645,
22
+ "exponential_decay_length_penalty": null,
23
+ "finetuning_task": null,
24
+ "forced_bos_token_id": null,
25
+ "forced_eos_token_id": null,
26
+ "fused_spec_config": null,
27
+ "head_dim": 128,
28
+ "hidden_act": "silu",
29
+ "hidden_size": 2048,
30
+ "id2label": {
31
+ "0": "LABEL_0",
32
+ "1": "LABEL_1"
33
+ },
34
+ "initializer_range": 0.02,
35
+ "intermediate_size": 6144,
36
+ "is_decoder": false,
37
+ "is_encoder_decoder": false,
38
+ "label2id": {
39
+ "LABEL_0": 0,
40
+ "LABEL_1": 1
41
+ },
42
+ "length_penalty": 1.0,
43
+ "max_length": 20,
44
+ "max_position_embeddings": 40960,
45
+ "max_window_layers": 28,
46
+ "metadata": null,
47
+ "min_length": 0,
48
+ "model_type": "qwen3",
49
+ "neuron_config": {
50
+ "activation_quantization_type": null,
51
+ "allow_input_truncation": false,
52
+ "apply_seq_ids_mask": false,
53
+ "async_mode": false,
54
+ "attention_dp_degree": 1,
55
+ "attention_dtype": null,
56
+ "attn_block_cte_nki_kernel_enabled": false,
57
+ "attn_block_tkg_nki_kernel_cache_update": false,
58
+ "attn_block_tkg_nki_kernel_cascaded_attention": false,
59
+ "attn_block_tkg_nki_kernel_enabled": false,
60
+ "attn_cls": {
61
+ "__module__": "neuronx_distributed_inference.models.qwen3.modeling_qwen3",
62
+ "__name__": "NeuronQwen3Attention"
63
+ },
64
+ "attn_kernel_enabled": null,
65
+ "attn_tkg_builtin_kernel_enabled": false,
66
+ "attn_tkg_nki_kernel_enabled": false,
67
+ "batch_size": 1,
68
+ "bucket_n_active_tokens": true,
69
+ "buckets": [
70
+ 128
71
+ ],
72
+ "cast_type": "config",
73
+ "cc_pipeline_tiling_factor": 2,
74
+ "chunked_prefill_config": null,
75
+ "context_encoding_buckets": [
76
+ 128
77
+ ],
78
+ "cp_degree": 1,
79
+ "ctx_batch_size": 1,
80
+ "disable_kv_cache_tiling": false,
81
+ "draft_model_modules_to_not_convert": null,
82
+ "enable_bucketing": true,
83
+ "enable_cte_modular_flow": false,
84
+ "enable_eagle_draft_input_norm": false,
85
+ "enable_eagle_speculation": false,
86
+ "enable_fused_speculation": false,
87
+ "enable_long_context_mode": false,
88
+ "enable_output_completion_notifications": false,
89
+ "enable_spill_reload_dge": false,
90
+ "enable_token_tree": false,
91
+ "ep_degree": 1,
92
+ "expert_mlp_nki_kernel_enabled": null,
93
+ "flash_decoding_enabled": false,
94
+ "fused_qkv": false,
95
+ "fused_rmsnorm_skip_gamma": false,
96
+ "is_block_kv_layout": null,
97
+ "is_chunked_prefill": false,
98
+ "is_continuous_batching": true,
99
+ "is_eagle_draft": false,
100
+ "is_medusa": false,
101
+ "is_prefill_stage": true,
102
+ "is_prefix_caching": false,
103
+ "k_cache_transposed": false,
104
+ "kv_cache_batch_size": 8,
105
+ "kv_cache_padding_size": 0,
106
+ "kv_cache_quant": false,
107
+ "kv_cache_tiling": false,
108
+ "layer_boundary_markers": false,
109
+ "lm_head_pad": true,
110
+ "lm_head_pad_alignment_size": 1,
111
+ "local_ranks_size": 2,
112
+ "logical_nc_config": 2,
113
+ "lora_config": null,
114
+ "max_batch_size": 8,
115
+ "max_context_length": 4096,
116
+ "max_length": 4096,
117
+ "max_new_tokens": null,
118
+ "medusa_speculation_length": 0,
119
+ "medusa_tree": null,
120
+ "mlp_kernel_enabled": false,
121
+ "mlp_kernel_fuse_residual_add": false,
122
+ "modules_to_not_convert": null,
123
+ "moe_fused_nki_kernel_enabled": null,
124
+ "n_active_tokens": 4096,
125
+ "n_positions": 4096,
126
+ "num_medusa_heads": 0,
127
+ "on_cpu": false,
128
+ "on_device_sampling_config": {
129
+ "deterministic": false,
130
+ "do_sample": false,
131
+ "dynamic": true,
132
+ "global_topk": 256,
133
+ "on_device_sampling_config": true,
134
+ "temperature": 1.0,
135
+ "top_k": 1,
136
+ "top_k_kernel_enabled": false,
137
+ "top_p": 1.0
138
+ },
139
+ "output_logits": false,
140
+ "overrides_torch_dtype": true,
141
+ "pa_block_size": 4096,
142
+ "pa_num_blocks": 8,
143
+ "padding_side": "right",
144
+ "pp_degree": 1,
145
+ "prefix_buckets": null,
146
+ "qk_layernorm": false,
147
+ "qkv_kernel_enabled": false,
148
+ "qkv_kernel_fuse_residual_add": false,
149
+ "qkv_kernel_nbsd_layout": false,
150
+ "quantization_dtype": "int8",
151
+ "quantization_type": "per_tensor_symmetric",
152
+ "quantize_clamp_bound": Infinity,
153
+ "quantized": false,
154
+ "quantized_checkpoints_path": null,
155
+ "quantized_mlp_kernel_enabled": false,
156
+ "rmsnorm_quantize_kernel_enabled": false,
157
+ "router_topk_nki_kernel_enabled": null,
158
+ "rpl_reduce_dtype": null,
159
+ "save_sharded_checkpoint": true,
160
+ "scratchpad_page_size": null,
161
+ "seq_len": 4096,
162
+ "seq_len_threshold_for_cc_tiling": 16384,
163
+ "sequence_parallel_enabled": false,
164
+ "shared_mlp_nki_kernel_enabled": null,
165
+ "skip_sharding": false,
166
+ "skip_warmup": false,
167
+ "spec_batch_size": 8,
168
+ "speculation_length": 0,
169
+ "start_rank_id": 0,
170
+ "strided_context_parallel_kernel_enabled": false,
171
+ "target": null,
172
+ "tensor_capture_config": null,
173
+ "tile_cc": false,
174
+ "tkg_batch_size": 8,
175
+ "token_generation_buckets": null,
176
+ "token_tree_config": null,
177
+ "torch_dtype": "bfloat16",
178
+ "tp_degree": 2,
179
+ "vocab_parallel": false,
180
+ "weight_gather_seq_len_threshold": 32768,
181
+ "weights_to_skip_layout_optimization": [],
182
+ "world_size": 2
183
+ },
184
+ "no_repeat_ngram_size": 0,
185
+ "num_attention_heads": 16,
186
+ "num_beam_groups": 1,
187
+ "num_beams": 1,
188
+ "num_cores_per_group": 1,
189
+ "num_hidden_layers": 28,
190
+ "num_key_value_heads": 8,
191
+ "num_return_sequences": 1,
192
+ "output_attentions": false,
193
+ "output_hidden_states": false,
194
+ "output_scores": false,
195
+ "pad_token_id": 0,
196
+ "prefix": null,
197
+ "problem_type": null,
198
+ "pruned_heads": {},
199
+ "remove_invalid_values": false,
200
+ "repetition_penalty": 1.0,
201
+ "return_dict": true,
202
+ "return_dict_in_generate": false,
203
+ "rms_norm_eps": 1e-06,
204
+ "rope_scaling": null,
205
+ "rope_theta": 1000000,
206
+ "sep_token_id": null,
207
+ "sliding_window": null,
208
+ "suppress_tokens": null,
209
+ "task_specific_params": null,
210
+ "temperature": 1.0,
211
+ "tf_legacy_loss": false,
212
+ "tie_encoder_decoder": false,
213
+ "tie_word_embeddings": true,
214
+ "tokenizer_class": null,
215
+ "top_k": 50,
216
+ "top_p": 1.0,
217
+ "torchscript": false,
218
+ "transformers_version": "4.51.0",
219
+ "typical_p": 1.0,
220
+ "use_bfloat16": false,
221
+ "use_cache": true,
222
+ "use_sliding_window": false,
223
+ "vocab_size": 151936
224
+ }
context_encoding_model/_tp0_bk1/command.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ neuronx-cc compile --framework=XLA model.MODULE_2330bfb0632c950ddab1+62ecd68b.hlo_module.pb --output model.MODULE_2330bfb0632c950ddab1+62ecd68b.neff --target=trn2 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ' --lnc=2 -O1 '--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true' --logfile=log-neuron-cc.txt --verbose=35
context_encoding_model/_tp0_bk1/compile_flags.MODULE_2330bfb0632c950ddab1+62ecd68b.json ADDED
@@ -0,0 +1 @@
 
 
1
+ ["--target=trn2", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "--lnc=2", "-O1", "--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true", "--logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk1/log-neuron-cc.txt"]
context_encoding_model/_tp0_bk1/global_metric_store.json ADDED
@@ -0,0 +1,1177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Average": {
3
+ "tensorizer": {
4
+ "StaticProfiler::AverageFractalPeUtilization": 98.71436309814453,
5
+ "StaticProfiler::AveragePartitionUtilization": 94.08551025390625,
6
+ "StaticProfiler::AveragePeUtilization": 96.60899353027344,
7
+ "StaticProfiler::LocalizationEfficiency": 95.931884765625,
8
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 99.52960968017578,
9
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
10
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0
11
+ }
12
+ },
13
+ "Count": {
14
+ "tensorizer": {
15
+ "StaticProfiler::AverageFractalPeUtilization": 1.0,
16
+ "StaticProfiler::AveragePartitionUtilization": 1.0,
17
+ "StaticProfiler::AveragePeUtilization": 1.0,
18
+ "StaticProfiler::LocalizationEfficiency": 1.0,
19
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1.0,
20
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 1.0,
21
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 1.0
22
+ }
23
+ },
24
+ "Sum": {
25
+ "compiletime": {
26
+ "AGOrderingAnalysisPass": 0.08984947204589844,
27
+ "AffinePredicateResolution": 0.0009312629699707031,
28
+ "AliasDependencyElimination": 0.00024366378784179688,
29
+ "AliasDependencyInduction": 0.005263328552246094,
30
+ "AliasDependencyReset": 0.04176759719848633,
31
+ "BFComputeCutting": 0.002216339111328125,
32
+ "BirCodeGenLoop": 0.3660314083099365,
33
+ "CCOpFusion": 0.04759931564331055,
34
+ "CanonicalizeConv": 4.999999873689376e-05,
35
+ "CanonicalizeDAGForPGTiling": 0.006819009780883789,
36
+ "CanonicalizeForTensorizer": 3.7000001611886546e-05,
37
+ "CanonicalizeIR": 0.0015099048614501953,
38
+ "Canonicalizer": 0.0008099999977275729,
39
+ "CoalesceCCOp": 0.014320611953735352,
40
+ "CommuteConcat": 0.0021598339080810547,
41
+ "DMALocalityOpt": 0.006499767303466797,
42
+ "DMAProfiler": 0.031740665435791016,
43
+ "DMATilingProfiler": 0.007287263870239258,
44
+ "DataLocalityOpt": 0.15184760093688965,
45
+ "DataStreaming": 0.030707597732543945,
46
+ "DeConcat": 0.0052378177642822266,
47
+ "DeadCodeElimination": 0.0020182132720947266,
48
+ "DeadStoreElimination": 0.007268428802490234,
49
+ "DelinearIndices": 0.006491422653198242,
50
+ "Delinearization": 0.00418853759765625,
51
+ "DelinearizeSPMD": 0.03150320053100586,
52
+ "DoNothing": 0.0004954338073730469,
53
+ "DramToDramTranspose": 0.028717756271362305,
54
+ "DumpGraphAndMetadata": 0.04632568359375,
55
+ "EliminateDivs": 0.0021729469299316406,
56
+ "ExpandBatchNorm": 0.0017549991607666016,
57
+ "ExpandISAMacro": 0.01276254653930664,
58
+ "FactorizeBlkDims": 0.07627987861633301,
59
+ "FactorizeThreadAxesInFreeDims": 0.0036237239837646484,
60
+ "FlattenMacroLoop": 0.012475728988647461,
61
+ "GenericAccessSimplifier": 0.0007128715515136719,
62
+ "HoistCompute": 9.999999747378752e-06,
63
+ "IdentifyCrossPassTensors": 3.899999865097925e-05,
64
+ "InferInitValue": 0.11746096611022949,
65
+ "InferIntrinsicOnCC": 0.008626222610473633,
66
+ "InferNeuronTensor": 0.17520785331726074,
67
+ "InferNonlocalTensors": 0.02865004539489746,
68
+ "InferPSumTensor": 0.097686767578125,
69
+ "InferShardAxis": 0.2832298278808594,
70
+ "InferSharedMemLoc": 0.024610280990600586,
71
+ "InlineNativeKernels": 0.0025413036346435547,
72
+ "InsertCoreBarrier": 0.014633417129516602,
73
+ "InsertIOTransposes": 0.058136701583862305,
74
+ "InsertImplicitShardAxisBeforeISel": 0.024377822875976563,
75
+ "InsertLocalTransposes": 0.016265153884887695,
76
+ "InsertOffloadedTransposes": 0.03376030921936035,
77
+ "LICM": 0.015621185302734375,
78
+ "LateLegalizeInst": 0.037809133529663086,
79
+ "LateLegalizePostSplit": 0.01734447479248047,
80
+ "LateLowerReshapeOp": 0.0016047954559326172,
81
+ "LateLowerTensorOp": 0.0011878013610839844,
82
+ "LateNeuronInstComb": 0.07452130317687988,
83
+ "LayoutPreprocessing": 0.05620622634887695,
84
+ "LayoutPreprocessingAndAnalysis": 0.18100428581237793,
85
+ "LayoutRequirementAnalysis": 0.014584064483642578,
86
+ "LegalizeCCOpLayout": 0.0032541751861572266,
87
+ "LegalizeOpLevelAlias": 0.0010030269622802734,
88
+ "LegalizePartitionReduce": 0.002452373504638672,
89
+ "LegalizeSundaAccess": 0.07152366638183594,
90
+ "LegalizeSundaMacro": 0.0427708625793457,
91
+ "LegalizeType": 0.03647494316101074,
92
+ "LocalLayoutOpt": 0.014898538589477539,
93
+ "LoopFusion": 0.005176067352294922,
94
+ "LoopSplitting": 0.00048732757568359375,
95
+ "LowerBroadcast": 0.019514799118041992,
96
+ "LowerCCOpBlockAxis": 0.004888296127319336,
97
+ "LowerComplexBroadcast": 0.010831594467163086,
98
+ "LowerIntrinsics": 0.05155062675476074,
99
+ "LowerShardAxis": 0.017355918884277344,
100
+ "LowerTensorOp": 0.013428449630737305,
101
+ "LowerToSendRecv": 0.038613319396972656,
102
+ "LowerTranspose": 0.058027029037475586,
103
+ "MacroGeneration": 0.1058506965637207,
104
+ "MaskPropagation": 0.004538536071777344,
105
+ "MemcastMotion": 2.2000000171829015e-05,
106
+ "MemcpyElimination": 0.04629826545715332,
107
+ "MutateDataType": 0.0012559890747070313,
108
+ "NeuronAliasDependencyInduction": 0.0006165504455566406,
109
+ "NeuronAliasDependencyReset": 0.03877615928649902,
110
+ "NeuronInstComb": 0.05556750297546387,
111
+ "NeuronLICM": 0.04741477966308594,
112
+ "NeuronLoopFusion": 0.08438324928283691,
113
+ "NeuronLoopInterchange": 0.0028100013732910156,
114
+ "NeuronSimplifier": 0.0370326042175293,
115
+ "NeuronSimplifyPredicates": 0.029002904891967773,
116
+ "NeuronValueNumbering": 0.014310836791992188,
117
+ "OptimizeAliasedCopyChain": 0.0005040168762207031,
118
+ "OptimizeNKIKernels": 4.637849807739258,
119
+ "PAGLayoutOpt": 0.15427088737487793,
120
+ "PComputeCutting": 0.022019147872924805,
121
+ "PGLayoutTilingPipeline": 1.5585658550262451,
122
+ "PGTiling": 0.3059046268463135,
123
+ "PadElimination": 0.00058746337890625,
124
+ "ParAxesAnnotation": 0.07737350463867188,
125
+ "PartialLoopFusion": 0.03046131134033203,
126
+ "PartialSimdFusion": 0.008630514144897461,
127
+ "PenguinizeFunctions": 3.699999797390774e-05,
128
+ "PerfectLoopNest": 0.0037374496459960938,
129
+ "PruneFunctions": 4.600000102072954e-05,
130
+ "RecognizeOpIdiom": 0.0049936771392822266,
131
+ "Recompute": 0.0004494190216064453,
132
+ "RelaxPredicates": 0.00769495964050293,
133
+ "Rematerialization": 0.0034401416778564453,
134
+ "RemoveOptimizationBarriers": 4.8000001697801054e-05,
135
+ "RemoveShardedPartitionAxes": 0.008293628692626953,
136
+ "ReshapeWeights": 0.004475116729736328,
137
+ "ResolveAccessConflict": 0.0053598880767822266,
138
+ "ResolveComplicatePredicates": 0.0009164810180664063,
139
+ "RewriteReplicationMatmul": 0.00577545166015625,
140
+ "RewriteWeights": 0.010277271270751953,
141
+ "SFKVectorizer": 0.2676401138305664,
142
+ "ScatterMotion": 3.199999991920777e-05,
143
+ "ShardingPropagationAnalysis": 0.06793785095214844,
144
+ "SimpleAllReduceTiling": 0.011077165603637695,
145
+ "Simplifier": 0.0029976367950439453,
146
+ "SimplifyMacroPredicates": 0.025454998016357422,
147
+ "SimplifyNeuronTensor": 0.13071107864379883,
148
+ "SimplifySlice": 0.0008246898651123047,
149
+ "SimplifyTensor": 0.03260469436645508,
150
+ "SpillPSum": 0.0713953971862793,
151
+ "SplitAPUnionSets": 0.08632850646972656,
152
+ "SplitAccGrp": 0.002518892288208008,
153
+ "StaticProfiler": 0.026699542999267578,
154
+ "StaticTransposeLocalTensor": 0.009710550308227539,
155
+ "SundaISel": 0.08615612983703613,
156
+ "TCTransform": 0.0014863014221191406,
157
+ "TensorInitialization": 0.017354965209960938,
158
+ "TensorOpSimplifier": 0.004897356033325195,
159
+ "TensorOpTransform": 0.026237010955810547,
160
+ "TensorizerLegalizationPass": 4.099999932805076e-05,
161
+ "TileCCOps": 0.007733821868896484,
162
+ "TilingProfiler": 0.03455352783203125,
163
+ "TransformConvOp": 0.0042724609375,
164
+ "TritiumFusion": 0.11825895309448242,
165
+ "ValueNumbering": 0.0019876956939697266,
166
+ "VectorizeDMA": 0.03213214874267578,
167
+ "VectorizeMatMult": 0.010382413864135742,
168
+ "VerifySupportedOps": 3.300000025774352e-05,
169
+ "WeightCoalescing": 0.010597944259643555,
170
+ "ZeroSizeTensorElimination": 0.00017881393432617188,
171
+ "algsimp": 0.0017300000181421638,
172
+ "batchnorm_expander": 3.5000000934815034e-05,
173
+ "boundary-marker-removal": 1.2000000424450263e-05,
174
+ "call-inliner": 0.00022000000171829015,
175
+ "canonicalize-boundary-marker": 2.2000000171829015e-05,
176
+ "collective-stream-id-checker": 6.299999949987978e-05,
177
+ "comparison-expander": 0.0005039999959990382,
178
+ "computation-deduplicator": 5.8999998145736754e-05,
179
+ "config-lowering": 9.800000407267362e-05,
180
+ "constant-statistics": 0.0004199999966658652,
181
+ "constant_folding": 0.00015699998766649514,
182
+ "cse": 3.699999797390774e-05,
183
+ "dce": 4.099999932805076e-05,
184
+ "dot_decomposer": 0.0009689999860711396,
185
+ "dynamic-slice-transpose": 1.4999999621068127e-05,
186
+ "eliminate-redundant-compare": 0.00013899999612476677,
187
+ "emit-offloaded-dropout": 3.900000228895806e-05,
188
+ "flatten-call-graph": 0.0006180000491440296,
189
+ "fuse-send-recv": 5.7999997807201e-05,
190
+ "hilo-conditional-to-select": 1.2999999853491317e-05,
191
+ "hilo::LegalizeAlias": 1.1000000085914508e-05,
192
+ "hilo::NeuronInstCombine": 0.0001770000089891255,
193
+ "hilo::NeuronOpFusion": 3.7999998312443495e-05,
194
+ "hilo::ReplaceTokenTypeWithU8Pass": 4.199999966658652e-05,
195
+ "hilo::ScheduleFusion": 3.999999989900971e-06,
196
+ "hilo::SixtyFourHack": 6.199999916134402e-05,
197
+ "hilo::VerifyAliasing": 4.999999873689376e-06,
198
+ "hlo-mac-count": 0.011359000578522682,
199
+ "instruction-histogram": 0.0004990000161342323,
200
+ "io-con-pipe-begin": 3.999999989900971e-06,
201
+ "io-con-pipe-end": 9.999999974752427e-07,
202
+ "io-layout-normalization": 0.0007779999868944287,
203
+ "io-statistics": 4.099999932805076e-05,
204
+ "legalize-ccops-for-tensorizer": 3.999999989900971e-06,
205
+ "legalize-compare": 1.1000000085914508e-05,
206
+ "lower-argminmax-custom-call": 9.999999747378752e-06,
207
+ "map-inline": 0.0007570000016130507,
208
+ "metadata-naming": 4.8000001697801054e-05,
209
+ "mlir::detail::OpToOpPassAdaptor": 6.500000017695129e-05,
210
+ "mlir::hlo::MhloToPyPenguin": 0.006823000032454729,
211
+ "mlir::mhlo::LowerComplexExtraPass": 0.00024300000222865492,
212
+ "mlir::mhlo::LowerComplexPass": 0.0003090000245720148,
213
+ "native-to-custom-softmax": 0.00030399998649954796,
214
+ "native-to-custom-softmax-dx": 0.0016090000281110406,
215
+ "neuron-hlo-verifier": 0.010127999819815159,
216
+ "operand_upcaster": 4.199999966658652e-05,
217
+ "opt-barrier-removal": 0.00026199998683296144,
218
+ "post-par-pipe-begin": 0.00030399998649954796,
219
+ "post-par-pipe-end": 0.0,
220
+ "post-partition-simplification": 0.0014479999663308263,
221
+ "pre-par-pipe-begin": 9.999999974752427e-07,
222
+ "pre-par-pipe-end": 0.0,
223
+ "pre-partition-simplification": 0.05613299831748009,
224
+ "replace-minimum-constant": 0.00029700002050958574,
225
+ "reshape-mover": 5.60000044060871e-05,
226
+ "simplify-concat": 0.00010799999290611595,
227
+ "simplify-while-loops": 5.0000002374872565e-05,
228
+ "transform-variadic-reduce": 6.299999949987978e-05,
229
+ "tuple-simplifier": 0.00014699998428113759,
230
+ "unpack-nested-aws-ntwsr": 0.00021999998716637492,
231
+ "unroll-while-loop": 7.000000096013537e-06,
232
+ "zero_sized_hlo_elimination": 0.0007450000266544521
233
+ },
234
+ "hilo": {
235
+ "ConstantSize": 467583.0,
236
+ "HloInputCount": 371.0,
237
+ "HloMacCount": 13175750656.0,
238
+ "HloOutputCount": 57.0,
239
+ "IfmapSize": 3910914048.0,
240
+ "OfmapSize": 1879048192.0,
241
+ "OutputsReadFromCount": 0.0,
242
+ "PassthroughTensorsCount": 0.0,
243
+ "RedundantOutputCount": 0.0,
244
+ "Traffic": 871990400.0
245
+ },
246
+ "tensorizer": {
247
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 20919.0,
248
+ "StaticProfiler::AifUb": 147.03309631347656,
249
+ "StaticProfiler::ArithmeticIntensityTensorizer": 141.05162048339844,
250
+ "StaticProfiler::AverageDmaLength": 2425.82958984375,
251
+ "StaticProfiler::DDRTransferBytes": 365941792.0,
252
+ "StaticProfiler::InternalTransferBytes": 325506848.0,
253
+ "StaticProfiler::LoadExpanded": 84060.0,
254
+ "StaticProfiler::StoreExpanded": 1898.0,
255
+ "StaticProfiler::TotalDMAExpanded": 85958.0,
256
+ "StaticProfiler::TotalDynamicInstancesCount": 25383.0,
257
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 24932.0,
258
+ "StaticProfiler::TotalLNCComm": 0.0,
259
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
260
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
261
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
262
+ "TilingProfiler::GenericInstructionsAfterTiling": 4.0,
263
+ "TilingProfiler::MatMultInstructionsAfterTiling": 10464.0,
264
+ "TilingProfiler::NumPfTransposes": 6.0,
265
+ "TilingProfiler::NumPfTransposesForIo": 1.0,
266
+ "TilingProfiler::NumPfTransposesForLocal": 1.0,
267
+ "TilingProfiler::NumPfTransposesForNonlocal": 4.0,
268
+ "TilingProfiler::PfTransposeInstructions": 10195.0,
269
+ "TilingProfiler::PfTransposeInstructionsForIo": 9504.0,
270
+ "TilingProfiler::PfTransposeInstructionsForLocal": 1.0,
271
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 690.0,
272
+ "TilingProfiler::ReduceInstructionsAfterTiling": 4.0,
273
+ "TilingProfiler::SimdInstructionsAfterTiling": 92.0,
274
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
275
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
276
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
277
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
278
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
279
+ "TransformConvOp::conv2d_column_packing": 0.0,
280
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
281
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
282
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
283
+ }
284
+ },
285
+ "all": {
286
+ "compiletime": {
287
+ "algsimp": 0.0015739999944344163,
288
+ "call-inliner": 0.00019500000053085387,
289
+ "collective-stream-id-checker": 5.400000009103678e-05,
290
+ "comparison-expander": 0.0004710000066552311,
291
+ "constant-statistics": 0.0004199999966658652,
292
+ "constant_folding": 0.0001320000010309741,
293
+ "dce": 3.7999998312443495e-05,
294
+ "dot_decomposer": 0.0009689999860711396,
295
+ "eliminate-redundant-compare": 0.00011899999663000926,
296
+ "flatten-call-graph": 0.0005910000181756914,
297
+ "hlo-mac-count": 0.006432000081986189,
298
+ "instruction-histogram": 0.0004990000161342323,
299
+ "io-con-pipe-begin": 3.999999989900971e-06,
300
+ "io-con-pipe-end": 9.999999974752427e-07,
301
+ "io-layout-normalization": 0.0007779999868944287,
302
+ "io-statistics": 4.099999932805076e-05,
303
+ "map-inline": 0.0007220000261440873,
304
+ "native-to-custom-softmax": 0.00028899998869746923,
305
+ "native-to-custom-softmax-dx": 0.00046099998871795833,
306
+ "neuron-hlo-verifier": 0.0090549997985363,
307
+ "opt-barrier-removal": 0.00026199998683296144,
308
+ "pre-par-pipe-begin": 9.999999974752427e-07,
309
+ "pre-par-pipe-end": 0.0,
310
+ "pre-partition-simplification": 0.05613299831748009,
311
+ "replace-minimum-constant": 0.0002770000137388706,
312
+ "reshape-mover": 4.70000013592653e-05,
313
+ "simplify-while-loops": 4.3000000005122274e-05,
314
+ "tuple-simplifier": 0.00013299999409355223,
315
+ "unpack-nested-aws-ntwsr": 0.00020799999765586108,
316
+ "unroll-while-loop": 7.000000096013537e-06,
317
+ "zero_sized_hlo_elimination": 0.0007450000266544521
318
+ }
319
+ },
320
+ "attention_isa_kernel": {
321
+ "compiletime": {
322
+ "CoalesceCCOp": 0.00019693374633789063,
323
+ "DMALocalityOpt": 0.00016736984252929688,
324
+ "DMAProfiler": 0.00026297569274902344,
325
+ "DataStreaming": 0.0002357959747314453,
326
+ "DoNothing": 0.004472255706787109,
327
+ "ExpandISAMacro": 0.00024008750915527344,
328
+ "FactorizeBlkDims": 0.001956939697265625,
329
+ "InferPSumTensor": 0.0005483627319335938,
330
+ "InferSharedMemLoc": 0.0012214183807373047,
331
+ "InsertCoreBarrier": 0.000339508056640625,
332
+ "LateLegalizeInst": 0.00020360946655273438,
333
+ "LateNeuronInstComb": 0.002096414566040039,
334
+ "LegalizeSundaAccess": 0.00022792816162109375,
335
+ "LegalizeType": 0.00030231475830078125,
336
+ "LowerBroadcast": 0.0002613067626953125,
337
+ "LowerIntrinsics": 0.0003268718719482422,
338
+ "LowerTranspose": 0.0002701282501220703,
339
+ "NeuronInstComb": 0.000457763671875,
340
+ "NeuronLICM": 0.0002644062042236328,
341
+ "NeuronSimplifyPredicates": 0.0002472400665283203,
342
+ "NeuronValueNumbering": 0.00029158592224121094,
343
+ "SFKVectorizer": 0.002269744873046875,
344
+ "SimpleAllReduceTiling": 0.00020956993103027344,
345
+ "SimplifyNeuronTensor": 0.0006353855133056641,
346
+ "SpillPSum": 0.0006325244903564453,
347
+ "WeightCoalescing": 0.00021409988403320313
348
+ }
349
+ },
350
+ "cumsum": {
351
+ "compiletime": {
352
+ "CoalesceCCOp": 0.00030303001403808594,
353
+ "DMALocalityOpt": 0.00025963783264160156,
354
+ "DMAProfiler": 0.0011391639709472656,
355
+ "DataStreaming": 0.0004107952117919922,
356
+ "DoNothing": 0.00016951560974121094,
357
+ "ExpandISAMacro": 0.0008628368377685547,
358
+ "FactorizeBlkDims": 0.0031676292419433594,
359
+ "InferPSumTensor": 0.0011391639709472656,
360
+ "InferSharedMemLoc": 0.0004911422729492188,
361
+ "InsertCoreBarrier": 0.0014476776123046875,
362
+ "LateLegalizeInst": 0.0051555633544921875,
363
+ "LateNeuronInstComb": 0.0011050701141357422,
364
+ "LegalizeSundaAccess": 0.0025599002838134766,
365
+ "LegalizeType": 0.0004215240478515625,
366
+ "LowerBroadcast": 0.0014843940734863281,
367
+ "LowerIntrinsics": 0.0016138553619384766,
368
+ "LowerTranspose": 0.00037097930908203125,
369
+ "NeuronInstComb": 0.0021207332611083984,
370
+ "NeuronLICM": 0.0007026195526123047,
371
+ "NeuronSimplifyPredicates": 0.004625082015991211,
372
+ "NeuronValueNumbering": 0.0007369518280029297,
373
+ "SFKVectorizer": 0.005678415298461914,
374
+ "SimpleAllReduceTiling": 0.0004096031188964844,
375
+ "SimplifyNeuronTensor": 0.0030858516693115234,
376
+ "SpillPSum": 0.0021026134490966797,
377
+ "WeightCoalescing": 0.0003502368927001953
378
+ }
379
+ },
380
+ "sg00": {
381
+ "compiletime": {
382
+ "CanonicalizeConv": 2.499999936844688e-05,
383
+ "CanonicalizeForTensorizer": 1.2999999853491317e-05,
384
+ "Canonicalizer": 0.00033400001120753586,
385
+ "HoistCompute": 3.000000106112566e-06,
386
+ "IdentifyCrossPassTensors": 1.4999999621068127e-05,
387
+ "MemcastMotion": 1.1000000085914508e-05,
388
+ "PenguinizeFunctions": 1.4999999621068127e-05,
389
+ "PruneFunctions": 1.4000000192027073e-05,
390
+ "RemoveOptimizationBarriers": 2.099999983329326e-05,
391
+ "ScatterMotion": 2.9999999242136255e-05,
392
+ "TensorizerLegalizationPass": 1.8999999156221747e-05,
393
+ "VerifySupportedOps": 1.1000000085914508e-05,
394
+ "algsimp": 5.8000001445179805e-05,
395
+ "batchnorm_expander": 1.1000000085914508e-05,
396
+ "boundary-marker-removal": 3.999999989900971e-06,
397
+ "call-inliner": 7.999999979801942e-06,
398
+ "canonicalize-boundary-marker": 1.2000000424450263e-05,
399
+ "collective-stream-id-checker": 3.000000106112566e-06,
400
+ "comparison-expander": 4.999999873689376e-06,
401
+ "computation-deduplicator": 1.8000000636675395e-05,
402
+ "config-lowering": 2.9999999242136255e-05,
403
+ "constant_folding": 9.000000318337698e-06,
404
+ "cse": 1.4000000192027073e-05,
405
+ "dce": 9.999999974752427e-07,
406
+ "dynamic-slice-transpose": 4.999999873689376e-06,
407
+ "eliminate-redundant-compare": 3.999999989900971e-06,
408
+ "emit-offloaded-dropout": 1.4000000192027073e-05,
409
+ "flatten-call-graph": 9.000000318337698e-06,
410
+ "fuse-send-recv": 2.099999983329326e-05,
411
+ "hilo-conditional-to-select": 3.000000106112566e-06,
412
+ "hilo::LegalizeAlias": 4.999999873689376e-06,
413
+ "hilo::NeuronInstCombine": 5.700000110664405e-05,
414
+ "hilo::NeuronOpFusion": 4.999999873689376e-06,
415
+ "hilo::ReplaceTokenTypeWithU8Pass": 1.1000000085914508e-05,
416
+ "hilo::ScheduleFusion": 9.999999974752427e-07,
417
+ "hilo::SixtyFourHack": 9.999999747378752e-06,
418
+ "hilo::VerifyAliasing": 1.9999999949504854e-06,
419
+ "hlo-mac-count": 7.899999764049426e-05,
420
+ "legalize-ccops-for-tensorizer": 1.9999999949504854e-06,
421
+ "legalize-compare": 3.999999989900971e-06,
422
+ "lower-argminmax-custom-call": 3.000000106112566e-06,
423
+ "map-inline": 1.2000000424450263e-05,
424
+ "metadata-naming": 1.4999999621068127e-05,
425
+ "mlir::detail::OpToOpPassAdaptor": 2.2000000171829015e-05,
426
+ "mlir::hlo::MhloToPyPenguin": 0.0009059999720193446,
427
+ "mlir::mhlo::LowerComplexExtraPass": 9.600000339560211e-05,
428
+ "mlir::mhlo::LowerComplexPass": 0.00018000000272877514,
429
+ "native-to-custom-softmax": 4.999999873689376e-06,
430
+ "native-to-custom-softmax-dx": 0.0011220000451430678,
431
+ "neuron-hlo-verifier": 0.00035700001171790063,
432
+ "operand_upcaster": 1.8000000636675395e-05,
433
+ "post-par-pipe-begin": 0.0003020000003743917,
434
+ "post-par-pipe-end": 0.0,
435
+ "post-partition-simplification": 0.0005360000068321824,
436
+ "replace-minimum-constant": 7.000000096013537e-06,
437
+ "reshape-mover": 3.000000106112566e-06,
438
+ "simplify-concat": 3.400000059627928e-05,
439
+ "simplify-while-loops": 3.000000106112566e-06,
440
+ "transform-variadic-reduce": 7.999999979801942e-06,
441
+ "tuple-simplifier": 4.999999873689376e-06,
442
+ "unpack-nested-aws-ntwsr": 3.999999989900971e-06,
443
+ "unroll-while-loop": 0.0
444
+ },
445
+ "hilo": {
446
+ "ArithmeticIntensity": 8.479304313659668,
447
+ "ConstantSize": 467583.0,
448
+ "HloInputCount": 371.0,
449
+ "HloMacCount": 1677721600.0,
450
+ "HloOutputCount": 57.0,
451
+ "IfmapSize": 3910914048.0,
452
+ "OfmapSize": 1879048192.0,
453
+ "OutputsReadFromCount": 0.0,
454
+ "PassthroughTensorsCount": 0.0,
455
+ "RedundantOutputCount": 0.0,
456
+ "Traffic": 395721504.0
457
+ }
458
+ },
459
+ "sg0000": {
460
+ "compiletime": {
461
+ "AGOrderingAnalysisPass": 0.05208444595336914,
462
+ "AffinePredicateResolution": 0.002167940139770508,
463
+ "AliasDependencyElimination": 0.00020503997802734375,
464
+ "AliasDependencyInduction": 0.006783246994018555,
465
+ "AliasDependencyReset": 0.20125508308410645,
466
+ "BFComputeCutting": 0.007937908172607422,
467
+ "BirCodeGenLoop": 0.10184049606323242,
468
+ "CCOpFusion": 0.03359842300415039,
469
+ "CanonicalizeDAGForPGTiling": 0.003628253936767578,
470
+ "CanonicalizeIR": 0.0030901432037353516,
471
+ "CoalesceCCOp": 0.017004013061523438,
472
+ "CommuteConcat": 0.0019147396087646484,
473
+ "DMALocalityOpt": 0.008169889450073242,
474
+ "DMAProfiler": 0.019730091094970703,
475
+ "DMATilingProfiler": 0.01212453842163086,
476
+ "DataLocalityOpt": 0.20879435539245605,
477
+ "DataStreaming": 0.017726421356201172,
478
+ "DeConcat": 0.0039784908294677734,
479
+ "DeadCodeElimination": 0.0020265579223632813,
480
+ "DeadStoreElimination": 0.023813247680664063,
481
+ "DelinearIndices": 0.020769357681274414,
482
+ "Delinearization": 0.03343391418457031,
483
+ "DelinearizeSPMD": 0.0467836856842041,
484
+ "DoNothing": 8.96453857421875e-05,
485
+ "DramToDramTranspose": 0.029311418533325195,
486
+ "DumpGraphAndMetadata": 0.008599281311035156,
487
+ "EliminateDivs": 0.003629446029663086,
488
+ "ExpandBatchNorm": 0.0015780925750732422,
489
+ "ExpandISAMacro": 0.006983280181884766,
490
+ "FactorizeBlkDims": 0.02126312255859375,
491
+ "FactorizeThreadAxesInFreeDims": 0.003243684768676758,
492
+ "FlattenMacroLoop": 0.0065686702728271484,
493
+ "GenericAccessSimplifier": 0.001466512680053711,
494
+ "InferInitValue": 0.04482269287109375,
495
+ "InferIntrinsicOnCC": 0.01812601089477539,
496
+ "InferNeuronTensor": 0.10232234001159668,
497
+ "InferNonlocalTensors": 0.17829585075378418,
498
+ "InferPSumTensor": 0.08844804763793945,
499
+ "InferShardAxis": 0.7131092548370361,
500
+ "InferSharedMemLoc": 0.007193565368652344,
501
+ "InlineNativeKernels": 0.006009101867675781,
502
+ "InsertCoreBarrier": 0.015059709548950195,
503
+ "InsertIOTransposes": 0.07647299766540527,
504
+ "InsertImplicitShardAxisBeforeISel": 0.020087480545043945,
505
+ "InsertLocalTransposes": 0.037857770919799805,
506
+ "InsertOffloadedTransposes": 0.022881269454956055,
507
+ "LICM": 0.012552261352539063,
508
+ "LateLegalizeInst": 0.025588512420654297,
509
+ "LateLegalizePostSplit": 0.012372970581054688,
510
+ "LateLowerReshapeOp": 0.004400491714477539,
511
+ "LateLowerTensorOp": 0.004253387451171875,
512
+ "LateNeuronInstComb": 0.039977073669433594,
513
+ "LayoutPreprocessing": 0.06799173355102539,
514
+ "LayoutPreprocessingAndAnalysis": 0.1176137924194336,
515
+ "LayoutRequirementAnalysis": 0.01578998565673828,
516
+ "LegalizeCCOpLayout": 0.0030679702758789063,
517
+ "LegalizeOpLevelAlias": 0.0017116069793701172,
518
+ "LegalizePartitionReduce": 0.002843618392944336,
519
+ "LegalizeSundaAccess": 0.08243513107299805,
520
+ "LegalizeSundaMacro": 0.02523207664489746,
521
+ "LegalizeType": 0.014882326126098633,
522
+ "LocalLayoutOpt": 0.019226789474487305,
523
+ "LoopFusion": 0.007382631301879883,
524
+ "LoopSplitting": 0.0006470680236816406,
525
+ "LowerBroadcast": 0.005588054656982422,
526
+ "LowerCCOpBlockAxis": 0.0077972412109375,
527
+ "LowerComplexBroadcast": 0.005771636962890625,
528
+ "LowerIntrinsics": 0.06823062896728516,
529
+ "LowerShardAxis": 0.01669931411743164,
530
+ "LowerTensorOp": 0.028963327407836914,
531
+ "LowerToSendRecv": 0.003696441650390625,
532
+ "LowerTranspose": 0.022225618362426758,
533
+ "MacroGeneration": 0.0702672004699707,
534
+ "MaskPropagation": 0.010986804962158203,
535
+ "MemcpyElimination": 0.1031653881072998,
536
+ "MutateDataType": 0.0030710697174072266,
537
+ "NeuronAliasDependencyInduction": 0.0008504390716552734,
538
+ "NeuronAliasDependencyReset": 0.10823488235473633,
539
+ "NeuronInstComb": 0.032953739166259766,
540
+ "NeuronLICM": 0.018877506256103516,
541
+ "NeuronLoopFusion": 0.03511810302734375,
542
+ "NeuronLoopInterchange": 0.009130239486694336,
543
+ "NeuronSimplifier": 0.02072596549987793,
544
+ "NeuronSimplifyPredicates": 0.005728721618652344,
545
+ "NeuronValueNumbering": 0.017284870147705078,
546
+ "OptimizeAliasedCopyChain": 0.0006775856018066406,
547
+ "OptimizeNKIKernels": 0.5134098529815674,
548
+ "PAGLayoutOpt": 0.5583286285400391,
549
+ "PComputeCutting": 0.026990413665771484,
550
+ "PGLayoutTilingPipeline": 2.505728006362915,
551
+ "PGTiling": 0.4031352996826172,
552
+ "PadElimination": 0.0005686283111572266,
553
+ "ParAxesAnnotation": 0.48941731452941895,
554
+ "PartialLoopFusion": 0.03877878189086914,
555
+ "PartialSimdFusion": 0.05450034141540527,
556
+ "PerfectLoopNest": 0.006276607513427734,
557
+ "RecognizeOpIdiom": 0.006324291229248047,
558
+ "Recompute": 0.0004134178161621094,
559
+ "RelaxPredicates": 0.008553743362426758,
560
+ "Rematerialization": 0.012713193893432617,
561
+ "RemoveShardedPartitionAxes": 0.04062914848327637,
562
+ "ReshapeWeights": 0.0019867420196533203,
563
+ "ResolveAccessConflict": 0.006893634796142578,
564
+ "ResolveComplicatePredicates": 0.0020072460174560547,
565
+ "RewriteReplicationMatmul": 0.002567291259765625,
566
+ "RewriteWeights": 0.008040666580200195,
567
+ "SFKVectorizer": 0.35219240188598633,
568
+ "ShardingPropagationAnalysis": 0.03732752799987793,
569
+ "SimpleAllReduceTiling": 0.00998234748840332,
570
+ "Simplifier": 0.00720524787902832,
571
+ "SimplifyMacroPredicates": 0.008156061172485352,
572
+ "SimplifyNeuronTensor": 0.020155906677246094,
573
+ "SimplifySlice": 0.0016894340515136719,
574
+ "SimplifyTensor": 0.01220250129699707,
575
+ "SpillPSum": 0.03788638114929199,
576
+ "SplitAPUnionSets": 0.05510139465332031,
577
+ "SplitAccGrp": 0.006468534469604492,
578
+ "StaticProfiler": 0.017852783203125,
579
+ "StaticTransposeLocalTensor": 0.00736546516418457,
580
+ "SundaISel": 0.09026622772216797,
581
+ "TCTransform": 0.0017704963684082031,
582
+ "TensorInitialization": 0.010450363159179688,
583
+ "TensorOpSimplifier": 0.02020740509033203,
584
+ "TensorOpTransform": 0.027513504028320313,
585
+ "TileCCOps": 0.008568286895751953,
586
+ "TilingProfiler": 0.03838157653808594,
587
+ "TransformConvOp": 0.007506370544433594,
588
+ "TritiumFusion": 0.050549983978271484,
589
+ "ValueNumbering": 0.0038373470306396484,
590
+ "VectorizeDMA": 0.017205238342285156,
591
+ "VectorizeMatMult": 0.021669626235961914,
592
+ "WeightCoalescing": 0.004259347915649414,
593
+ "ZeroSizeTensorElimination": 0.00019121170043945313
594
+ },
595
+ "tensorizer": {
596
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 847.0,
597
+ "StaticProfiler::AifUb": 8.478300094604492,
598
+ "StaticProfiler::ArithmeticIntensityTensorizer": 131.77493286132813,
599
+ "StaticProfiler::AverageDmaLength": 1355.7093505859375,
600
+ "StaticProfiler::AverageFractalPeUtilization": 99.68699645996094,
601
+ "StaticProfiler::AveragePartitionUtilization": 99.0614013671875,
602
+ "StaticProfiler::AveragePeUtilization": 99.3685073852539,
603
+ "StaticProfiler::DDRTransferBytes": 29617926.0,
604
+ "StaticProfiler::InternalTransferBytes": 11470848.0,
605
+ "StaticProfiler::LoadExpanded": 12422.0,
606
+ "StaticProfiler::LocalizationEfficiency": 1554.2613525390625,
607
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1973.466064453125,
608
+ "StaticProfiler::StoreExpanded": 5889.0,
609
+ "StaticProfiler::TotalDMAExpanded": 18311.0,
610
+ "StaticProfiler::TotalDynamicInstancesCount": 1115.0,
611
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 1113.0,
612
+ "StaticProfiler::TotalLNCComm": 0.0,
613
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
614
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
615
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
616
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
617
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
618
+ "TilingProfiler::GenericInstructionsAfterTiling": 20.0,
619
+ "TilingProfiler::MatMultInstructionsAfterTiling": 514.0,
620
+ "TilingProfiler::NumPfTransposes": 6.0,
621
+ "TilingProfiler::NumPfTransposesForIo": 1.0,
622
+ "TilingProfiler::NumPfTransposesForLocal": 2.0,
623
+ "TilingProfiler::NumPfTransposesForNonlocal": 3.0,
624
+ "TilingProfiler::PfTransposeInstructions": 104.0,
625
+ "TilingProfiler::PfTransposeInstructionsForIo": 32.0,
626
+ "TilingProfiler::PfTransposeInstructionsForLocal": 24.0,
627
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 48.0,
628
+ "TilingProfiler::ReduceInstructionsAfterTiling": 0.0,
629
+ "TilingProfiler::SimdInstructionsAfterTiling": 86.0,
630
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
631
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
632
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
633
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
634
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
635
+ "TransformConvOp::conv2d_column_packing": 0.0,
636
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
637
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
638
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
639
+ }
640
+ },
641
+ "sg0001": {
642
+ "compiletime": {
643
+ "AGOrderingAnalysisPass": 0.10172820091247559,
644
+ "AffinePredicateResolution": 0.0019948482513427734,
645
+ "AliasDependencyElimination": 0.0002758502960205078,
646
+ "AliasDependencyInduction": 0.007086038589477539,
647
+ "AliasDependencyReset": 0.13438987731933594,
648
+ "BFComputeCutting": 0.0027761459350585938,
649
+ "BirCodeGenLoop": 0.05368757247924805,
650
+ "CCOpFusion": 0.03205680847167969,
651
+ "CanonicalizeDAGForPGTiling": 0.0052297115325927734,
652
+ "CanonicalizeIR": 0.002682924270629883,
653
+ "CoalesceCCOp": 0.008353471755981445,
654
+ "CommuteConcat": 0.0031654834747314453,
655
+ "DMALocalityOpt": 0.0032248497009277344,
656
+ "DMAProfiler": 0.006761789321899414,
657
+ "DMATilingProfiler": 0.00853872299194336,
658
+ "DataLocalityOpt": 0.366649866104126,
659
+ "DataStreaming": 0.008889198303222656,
660
+ "DeConcat": 0.002901792526245117,
661
+ "DeadCodeElimination": 0.016579151153564453,
662
+ "DeadStoreElimination": 0.029788732528686523,
663
+ "DelinearIndices": 0.019867897033691406,
664
+ "Delinearization": 0.0065822601318359375,
665
+ "DelinearizeSPMD": 0.023911237716674805,
666
+ "DoNothing": 7.867813110351563e-05,
667
+ "DramToDramTranspose": 0.026773452758789063,
668
+ "DumpGraphAndMetadata": 0.006331682205200195,
669
+ "EliminateDivs": 0.006492137908935547,
670
+ "ExpandBatchNorm": 0.0019371509552001953,
671
+ "ExpandISAMacro": 0.011901378631591797,
672
+ "FactorizeBlkDims": 0.03787398338317871,
673
+ "FactorizeThreadAxesInFreeDims": 0.0023696422576904297,
674
+ "FlattenMacroLoop": 0.006732463836669922,
675
+ "GenericAccessSimplifier": 0.0011754035949707031,
676
+ "InferInitValue": 0.07735943794250488,
677
+ "InferIntrinsicOnCC": 0.017465829849243164,
678
+ "InferNeuronTensor": 0.09335732460021973,
679
+ "InferNonlocalTensors": 0.029421567916870117,
680
+ "InferPSumTensor": 0.12906312942504883,
681
+ "InferShardAxis": 0.7434248924255371,
682
+ "InferSharedMemLoc": 0.005700111389160156,
683
+ "InlineNativeKernels": 0.002834320068359375,
684
+ "InsertCoreBarrier": 0.006781339645385742,
685
+ "InsertIOTransposes": 0.0841522216796875,
686
+ "InsertImplicitShardAxisBeforeISel": 0.012434244155883789,
687
+ "InsertLocalTransposes": 0.019251346588134766,
688
+ "InsertOffloadedTransposes": 0.028300762176513672,
689
+ "LICM": 0.005795001983642578,
690
+ "LateLegalizeInst": 0.011514902114868164,
691
+ "LateLegalizePostSplit": 0.005158185958862305,
692
+ "LateLowerReshapeOp": 0.0047490596771240234,
693
+ "LateLowerTensorOp": 0.004218101501464844,
694
+ "LateNeuronInstComb": 0.047844648361206055,
695
+ "LayoutPreprocessing": 0.03463029861450195,
696
+ "LayoutPreprocessingAndAnalysis": 0.06621217727661133,
697
+ "LayoutRequirementAnalysis": 0.007728099822998047,
698
+ "LegalizeCCOpLayout": 0.003231048583984375,
699
+ "LegalizeOpLevelAlias": 0.001981973648071289,
700
+ "LegalizePartitionReduce": 0.0027234554290771484,
701
+ "LegalizeSundaAccess": 0.04511404037475586,
702
+ "LegalizeSundaMacro": 0.022600412368774414,
703
+ "LegalizeType": 0.0190885066986084,
704
+ "LocalLayoutOpt": 0.04217672348022461,
705
+ "LoopFusion": 0.012153148651123047,
706
+ "LoopSplitting": 0.0006983280181884766,
707
+ "LowerBroadcast": 0.001943826675415039,
708
+ "LowerCCOpBlockAxis": 0.007781505584716797,
709
+ "LowerComplexBroadcast": 0.004039287567138672,
710
+ "LowerIntrinsics": 0.08824563026428223,
711
+ "LowerShardAxis": 0.008327722549438477,
712
+ "LowerTensorOp": 0.033898115158081055,
713
+ "LowerToSendRecv": 0.005768775939941406,
714
+ "LowerTranspose": 0.02297377586364746,
715
+ "MacroGeneration": 0.16904258728027344,
716
+ "MaskPropagation": 0.007157087326049805,
717
+ "MemcpyElimination": 0.08653593063354492,
718
+ "MutateDataType": 0.001874685287475586,
719
+ "NeuronAliasDependencyInduction": 0.0008199214935302734,
720
+ "NeuronAliasDependencyReset": 0.09268832206726074,
721
+ "NeuronInstComb": 0.013442754745483398,
722
+ "NeuronLICM": 0.04093337059020996,
723
+ "NeuronLoopFusion": 0.07855010032653809,
724
+ "NeuronLoopInterchange": 0.0029878616333007813,
725
+ "NeuronSimplifier": 0.013553857803344727,
726
+ "NeuronSimplifyPredicates": 0.0043621063232421875,
727
+ "NeuronValueNumbering": 0.011638164520263672,
728
+ "OptimizeAliasedCopyChain": 0.001085042953491211,
729
+ "OptimizeNKIKernels": 0.4002358913421631,
730
+ "PAGLayoutOpt": 0.5899946689605713,
731
+ "PComputeCutting": 0.011747598648071289,
732
+ "PGLayoutTilingPipeline": 2.3099381923675537,
733
+ "PGTiling": 0.39591336250305176,
734
+ "PadElimination": 0.0018284320831298828,
735
+ "ParAxesAnnotation": 0.5343668460845947,
736
+ "PartialLoopFusion": 0.0648810863494873,
737
+ "PartialSimdFusion": 0.06934404373168945,
738
+ "PerfectLoopNest": 0.010063648223876953,
739
+ "RecognizeOpIdiom": 0.006760358810424805,
740
+ "Recompute": 0.0004215240478515625,
741
+ "RelaxPredicates": 0.004682064056396484,
742
+ "Rematerialization": 0.0020973682403564453,
743
+ "RemoveShardedPartitionAxes": 0.03322100639343262,
744
+ "ReshapeWeights": 0.005750894546508789,
745
+ "ResolveAccessConflict": 0.005618572235107422,
746
+ "ResolveComplicatePredicates": 0.0011665821075439453,
747
+ "RewriteReplicationMatmul": 0.0025589466094970703,
748
+ "RewriteWeights": 0.010002374649047852,
749
+ "SFKVectorizer": 0.2708115577697754,
750
+ "ShardingPropagationAnalysis": 0.04528522491455078,
751
+ "SimpleAllReduceTiling": 0.003036975860595703,
752
+ "Simplifier": 0.004547834396362305,
753
+ "SimplifyMacroPredicates": 0.0300595760345459,
754
+ "SimplifyNeuronTensor": 0.014966249465942383,
755
+ "SimplifySlice": 0.01027679443359375,
756
+ "SimplifyTensor": 0.020308732986450195,
757
+ "SpillPSum": 0.04539823532104492,
758
+ "SplitAPUnionSets": 0.023496150970458984,
759
+ "SplitAccGrp": 0.0026144981384277344,
760
+ "StaticProfiler": 0.006074190139770508,
761
+ "StaticTransposeLocalTensor": 0.006592273712158203,
762
+ "SundaISel": 0.06954693794250488,
763
+ "TCTransform": 0.001828908920288086,
764
+ "TensorInitialization": 0.00876927375793457,
765
+ "TensorOpSimplifier": 0.011527299880981445,
766
+ "TensorOpTransform": 0.03972220420837402,
767
+ "TileCCOps": 0.00546574592590332,
768
+ "TilingProfiler": 0.02742171287536621,
769
+ "TransformConvOp": 0.006824016571044922,
770
+ "TritiumFusion": 0.11011958122253418,
771
+ "ValueNumbering": 0.004981040954589844,
772
+ "VectorizeDMA": 0.03582024574279785,
773
+ "VectorizeMatMult": 0.0291445255279541,
774
+ "WeightCoalescing": 0.008509397506713867,
775
+ "ZeroSizeTensorElimination": 0.00014853477478027344
776
+ },
777
+ "tensorizer": {
778
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 1813.0,
779
+ "StaticProfiler::AifUb": 76.42292022705078,
780
+ "StaticProfiler::ArithmeticIntensityTensorizer": 227.36143493652344,
781
+ "StaticProfiler::AverageDmaLength": 4034.3251953125,
782
+ "StaticProfiler::AverageFractalPeUtilization": 100.0,
783
+ "StaticProfiler::AveragePartitionUtilization": 99.65364074707031,
784
+ "StaticProfiler::AveragePeUtilization": 100.0,
785
+ "StaticProfiler::DDRTransferBytes": 63514120.0,
786
+ "StaticProfiler::InternalTransferBytes": 13500416.0,
787
+ "StaticProfiler::LoadExpanded": 10497.0,
788
+ "StaticProfiler::LocalizationEfficiency": 297.5042419433594,
789
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 352.84381103515625,
790
+ "StaticProfiler::StoreExpanded": 2561.0,
791
+ "StaticProfiler::TotalDMAExpanded": 13058.0,
792
+ "StaticProfiler::TotalDynamicInstancesCount": 2025.0,
793
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 2025.0,
794
+ "StaticProfiler::TotalLNCComm": 0.0,
795
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
796
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
797
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
798
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
799
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
800
+ "TilingProfiler::GenericInstructionsAfterTiling": 16.0,
801
+ "TilingProfiler::MatMultInstructionsAfterTiling": 1280.0,
802
+ "TilingProfiler::NumPfTransposes": 7.0,
803
+ "TilingProfiler::NumPfTransposesForIo": 3.0,
804
+ "TilingProfiler::NumPfTransposesForLocal": 2.0,
805
+ "TilingProfiler::NumPfTransposesForNonlocal": 2.0,
806
+ "TilingProfiler::PfTransposeInstructions": 116.0,
807
+ "TilingProfiler::PfTransposeInstructionsForIo": 36.0,
808
+ "TilingProfiler::PfTransposeInstructionsForLocal": 16.0,
809
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 64.0,
810
+ "TilingProfiler::ReduceInstructionsAfterTiling": 0.0,
811
+ "TilingProfiler::SimdInstructionsAfterTiling": 113.0,
812
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
813
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
814
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
815
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
816
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
817
+ "TransformConvOp::conv2d_column_packing": 0.0,
818
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
819
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
820
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
821
+ }
822
+ },
823
+ "sg0002": {
824
+ "compiletime": {
825
+ "AGOrderingAnalysisPass": 0.08984947204589844,
826
+ "AffinePredicateResolution": 0.0009312629699707031,
827
+ "AliasDependencyElimination": 0.00024366378784179688,
828
+ "AliasDependencyInduction": 0.005263328552246094,
829
+ "AliasDependencyReset": 0.04176759719848633,
830
+ "BFComputeCutting": 0.002216339111328125,
831
+ "BirCodeGenLoop": 0.3660314083099365,
832
+ "CCOpFusion": 0.04759931564331055,
833
+ "CanonicalizeDAGForPGTiling": 0.006819009780883789,
834
+ "CanonicalizeIR": 0.0015099048614501953,
835
+ "CoalesceCCOp": 0.007388591766357422,
836
+ "CommuteConcat": 0.0021598339080810547,
837
+ "DMALocalityOpt": 0.002432584762573242,
838
+ "DMAProfiler": 0.022784948348999023,
839
+ "DMATilingProfiler": 0.007287263870239258,
840
+ "DataLocalityOpt": 0.15184760093688965,
841
+ "DataStreaming": 0.007554292678833008,
842
+ "DeConcat": 0.0052378177642822266,
843
+ "DeadCodeElimination": 0.0020182132720947266,
844
+ "DeadStoreElimination": 0.007268428802490234,
845
+ "DelinearIndices": 0.006491422653198242,
846
+ "Delinearization": 0.00418853759765625,
847
+ "DelinearizeSPMD": 0.03150320053100586,
848
+ "DoNothing": 8.726119995117188e-05,
849
+ "DramToDramTranspose": 0.028717756271362305,
850
+ "DumpGraphAndMetadata": 0.04632568359375,
851
+ "EliminateDivs": 0.0021729469299316406,
852
+ "ExpandBatchNorm": 0.0017549991607666016,
853
+ "ExpandISAMacro": 0.0053784847259521484,
854
+ "FactorizeBlkDims": 0.046364784240722656,
855
+ "FactorizeThreadAxesInFreeDims": 0.0036237239837646484,
856
+ "FlattenMacroLoop": 0.012475728988647461,
857
+ "GenericAccessSimplifier": 0.0007128715515136719,
858
+ "InferInitValue": 0.11746096611022949,
859
+ "InferIntrinsicOnCC": 0.008626222610473633,
860
+ "InferNeuronTensor": 0.17520785331726074,
861
+ "InferNonlocalTensors": 0.02865004539489746,
862
+ "InferPSumTensor": 0.07464981079101563,
863
+ "InferShardAxis": 0.2832298278808594,
864
+ "InferSharedMemLoc": 0.01778268814086914,
865
+ "InlineNativeKernels": 0.0025413036346435547,
866
+ "InsertCoreBarrier": 0.007167816162109375,
867
+ "InsertIOTransposes": 0.058136701583862305,
868
+ "InsertImplicitShardAxisBeforeISel": 0.024377822875976563,
869
+ "InsertLocalTransposes": 0.016265153884887695,
870
+ "InsertOffloadedTransposes": 0.03376030921936035,
871
+ "LICM": 0.015621185302734375,
872
+ "LateLegalizeInst": 0.018033266067504883,
873
+ "LateLegalizePostSplit": 0.01734447479248047,
874
+ "LateLowerReshapeOp": 0.0016047954559326172,
875
+ "LateLowerTensorOp": 0.0011878013610839844,
876
+ "LateNeuronInstComb": 0.05313730239868164,
877
+ "LayoutPreprocessing": 0.05620622634887695,
878
+ "LayoutPreprocessingAndAnalysis": 0.18100428581237793,
879
+ "LayoutRequirementAnalysis": 0.014584064483642578,
880
+ "LegalizeCCOpLayout": 0.0032541751861572266,
881
+ "LegalizeOpLevelAlias": 0.0010030269622802734,
882
+ "LegalizePartitionReduce": 0.002452373504638672,
883
+ "LegalizeSundaAccess": 0.040776968002319336,
884
+ "LegalizeSundaMacro": 0.0427708625793457,
885
+ "LegalizeType": 0.016519784927368164,
886
+ "LocalLayoutOpt": 0.014898538589477539,
887
+ "LoopFusion": 0.005176067352294922,
888
+ "LoopSplitting": 0.00048732757568359375,
889
+ "LowerBroadcast": 0.004655599594116211,
890
+ "LowerCCOpBlockAxis": 0.004888296127319336,
891
+ "LowerComplexBroadcast": 0.010831594467163086,
892
+ "LowerIntrinsics": 0.03900289535522461,
893
+ "LowerShardAxis": 0.017355918884277344,
894
+ "LowerTensorOp": 0.013428449630737305,
895
+ "LowerToSendRecv": 0.038613319396972656,
896
+ "LowerTranspose": 0.050206661224365234,
897
+ "MacroGeneration": 0.1058506965637207,
898
+ "MaskPropagation": 0.004538536071777344,
899
+ "MemcpyElimination": 0.04629826545715332,
900
+ "MutateDataType": 0.0012559890747070313,
901
+ "NeuronAliasDependencyInduction": 0.0006165504455566406,
902
+ "NeuronAliasDependencyReset": 0.03877615928649902,
903
+ "NeuronInstComb": 0.02690267562866211,
904
+ "NeuronLICM": 0.024822473526000977,
905
+ "NeuronLoopFusion": 0.08438324928283691,
906
+ "NeuronLoopInterchange": 0.0028100013732910156,
907
+ "NeuronSimplifier": 0.0370326042175293,
908
+ "NeuronSimplifyPredicates": 0.017668962478637695,
909
+ "NeuronValueNumbering": 0.006052970886230469,
910
+ "OptimizeAliasedCopyChain": 0.0005040168762207031,
911
+ "OptimizeNKIKernels": 4.637849807739258,
912
+ "PAGLayoutOpt": 0.15427088737487793,
913
+ "PComputeCutting": 0.022019147872924805,
914
+ "PGLayoutTilingPipeline": 1.5585658550262451,
915
+ "PGTiling": 0.3059046268463135,
916
+ "PadElimination": 0.00058746337890625,
917
+ "ParAxesAnnotation": 0.07737350463867188,
918
+ "PartialLoopFusion": 0.03046131134033203,
919
+ "PartialSimdFusion": 0.008630514144897461,
920
+ "PerfectLoopNest": 0.0037374496459960938,
921
+ "RecognizeOpIdiom": 0.0049936771392822266,
922
+ "Recompute": 0.0004494190216064453,
923
+ "RelaxPredicates": 0.00769495964050293,
924
+ "Rematerialization": 0.0034401416778564453,
925
+ "RemoveShardedPartitionAxes": 0.008293628692626953,
926
+ "ReshapeWeights": 0.004475116729736328,
927
+ "ResolveAccessConflict": 0.0053598880767822266,
928
+ "ResolveComplicatePredicates": 0.0009164810180664063,
929
+ "RewriteReplicationMatmul": 0.00577545166015625,
930
+ "RewriteWeights": 0.010277271270751953,
931
+ "SFKVectorizer": 0.19967889785766602,
932
+ "ShardingPropagationAnalysis": 0.06793785095214844,
933
+ "SimpleAllReduceTiling": 0.004133701324462891,
934
+ "Simplifier": 0.0029976367950439453,
935
+ "SimplifyMacroPredicates": 0.025454998016357422,
936
+ "SimplifyNeuronTensor": 0.029609203338623047,
937
+ "SimplifySlice": 0.0008246898651123047,
938
+ "SimplifyTensor": 0.03260469436645508,
939
+ "SpillPSum": 0.01929450035095215,
940
+ "SplitAPUnionSets": 0.08632850646972656,
941
+ "SplitAccGrp": 0.002518892288208008,
942
+ "StaticProfiler": 0.026699542999267578,
943
+ "StaticTransposeLocalTensor": 0.009710550308227539,
944
+ "SundaISel": 0.08615612983703613,
945
+ "TCTransform": 0.0014863014221191406,
946
+ "TensorInitialization": 0.017354965209960938,
947
+ "TensorOpSimplifier": 0.004897356033325195,
948
+ "TensorOpTransform": 0.026237010955810547,
949
+ "TileCCOps": 0.007733821868896484,
950
+ "TilingProfiler": 0.03455352783203125,
951
+ "TransformConvOp": 0.0042724609375,
952
+ "TritiumFusion": 0.11825895309448242,
953
+ "ValueNumbering": 0.0019876956939697266,
954
+ "VectorizeDMA": 0.03213214874267578,
955
+ "VectorizeMatMult": 0.010382413864135742,
956
+ "WeightCoalescing": 0.003669261932373047,
957
+ "ZeroSizeTensorElimination": 0.00017881393432617188
958
+ },
959
+ "tensorizer": {
960
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 20919.0,
961
+ "StaticProfiler::AifUb": 147.03309631347656,
962
+ "StaticProfiler::ArithmeticIntensityTensorizer": 141.05162048339844,
963
+ "StaticProfiler::AverageDmaLength": 2425.82958984375,
964
+ "StaticProfiler::AverageFractalPeUtilization": 98.71436309814453,
965
+ "StaticProfiler::AveragePartitionUtilization": 94.08551025390625,
966
+ "StaticProfiler::AveragePeUtilization": 96.60899353027344,
967
+ "StaticProfiler::DDRTransferBytes": 365941792.0,
968
+ "StaticProfiler::InternalTransferBytes": 325506848.0,
969
+ "StaticProfiler::LoadExpanded": 84060.0,
970
+ "StaticProfiler::LocalizationEfficiency": 95.931884765625,
971
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 99.52960968017578,
972
+ "StaticProfiler::StoreExpanded": 1898.0,
973
+ "StaticProfiler::TotalDMAExpanded": 85958.0,
974
+ "StaticProfiler::TotalDynamicInstancesCount": 25383.0,
975
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 24932.0,
976
+ "StaticProfiler::TotalLNCComm": 0.0,
977
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
978
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
979
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
980
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
981
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
982
+ "TilingProfiler::GenericInstructionsAfterTiling": 4.0,
983
+ "TilingProfiler::MatMultInstructionsAfterTiling": 10464.0,
984
+ "TilingProfiler::NumPfTransposes": 6.0,
985
+ "TilingProfiler::NumPfTransposesForIo": 1.0,
986
+ "TilingProfiler::NumPfTransposesForLocal": 1.0,
987
+ "TilingProfiler::NumPfTransposesForNonlocal": 4.0,
988
+ "TilingProfiler::PfTransposeInstructions": 10195.0,
989
+ "TilingProfiler::PfTransposeInstructionsForIo": 9504.0,
990
+ "TilingProfiler::PfTransposeInstructionsForLocal": 1.0,
991
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 690.0,
992
+ "TilingProfiler::ReduceInstructionsAfterTiling": 4.0,
993
+ "TilingProfiler::SimdInstructionsAfterTiling": 92.0,
994
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
995
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
996
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
997
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
998
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
999
+ "TransformConvOp::conv2d_column_packing": 0.0,
1000
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
1001
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
1002
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
1003
+ }
1004
+ },
1005
+ "sg01": {
1006
+ "compiletime": {
1007
+ "CanonicalizeConv": 2.499999936844688e-05,
1008
+ "CanonicalizeForTensorizer": 1.1000000085914508e-05,
1009
+ "Canonicalizer": 0.00020599999697878957,
1010
+ "HoistCompute": 1.9999999949504854e-06,
1011
+ "IdentifyCrossPassTensors": 1.2000000424450263e-05,
1012
+ "MemcastMotion": 7.999999979801942e-06,
1013
+ "PenguinizeFunctions": 9.999999747378752e-06,
1014
+ "PruneFunctions": 1.4999999621068127e-05,
1015
+ "RemoveOptimizationBarriers": 1.9999999494757503e-05,
1016
+ "ScatterMotion": 1.9999999949504854e-06,
1017
+ "TensorizerLegalizationPass": 1.4000000192027073e-05,
1018
+ "VerifySupportedOps": 9.999999747378752e-06,
1019
+ "algsimp": 4.999999873689376e-05,
1020
+ "batchnorm_expander": 1.1000000085914508e-05,
1021
+ "boundary-marker-removal": 3.999999989900971e-06,
1022
+ "call-inliner": 7.999999979801942e-06,
1023
+ "canonicalize-boundary-marker": 4.999999873689376e-06,
1024
+ "collective-stream-id-checker": 3.000000106112566e-06,
1025
+ "comparison-expander": 3.999999989900971e-06,
1026
+ "computation-deduplicator": 1.8000000636675395e-05,
1027
+ "config-lowering": 3.400000059627928e-05,
1028
+ "constant_folding": 9.000000318337698e-06,
1029
+ "cse": 9.999999747378752e-06,
1030
+ "dce": 9.999999974752427e-07,
1031
+ "dynamic-slice-transpose": 4.999999873689376e-06,
1032
+ "eliminate-redundant-compare": 1.2999999853491317e-05,
1033
+ "emit-offloaded-dropout": 1.2999999853491317e-05,
1034
+ "flatten-call-graph": 7.000000096013537e-06,
1035
+ "fuse-send-recv": 1.8000000636675395e-05,
1036
+ "hilo-conditional-to-select": 4.999999873689376e-06,
1037
+ "hilo::LegalizeAlias": 3.999999989900971e-06,
1038
+ "hilo::NeuronInstCombine": 5.6000000768108293e-05,
1039
+ "hilo::NeuronOpFusion": 2.300000051036477e-05,
1040
+ "hilo::ReplaceTokenTypeWithU8Pass": 1.2000000424450263e-05,
1041
+ "hilo::ScheduleFusion": 9.999999974752427e-07,
1042
+ "hilo::SixtyFourHack": 7.999999979801942e-06,
1043
+ "hilo::VerifyAliasing": 1.9999999949504854e-06,
1044
+ "hlo-mac-count": 8.900000102585182e-05,
1045
+ "legalize-ccops-for-tensorizer": 9.999999974752427e-07,
1046
+ "legalize-compare": 3.999999989900971e-06,
1047
+ "lower-argminmax-custom-call": 3.999999989900971e-06,
1048
+ "map-inline": 9.999999747378752e-06,
1049
+ "metadata-naming": 1.700000029813964e-05,
1050
+ "mlir::detail::OpToOpPassAdaptor": 1.8999999156221747e-05,
1051
+ "mlir::hlo::MhloToPyPenguin": 0.0009159999899566174,
1052
+ "mlir::mhlo::LowerComplexExtraPass": 6.900000153109431e-05,
1053
+ "mlir::mhlo::LowerComplexPass": 0.00011800000356743112,
1054
+ "native-to-custom-softmax": 4.999999873689376e-06,
1055
+ "native-to-custom-softmax-dx": 1.2999999853491317e-05,
1056
+ "neuron-hlo-verifier": 0.00035600000410340726,
1057
+ "operand_upcaster": 1.2000000424450263e-05,
1058
+ "post-par-pipe-begin": 9.999999974752427e-07,
1059
+ "post-par-pipe-end": 0.0,
1060
+ "post-partition-simplification": 0.0004619999963324517,
1061
+ "replace-minimum-constant": 6.000000212225132e-06,
1062
+ "reshape-mover": 3.000000106112566e-06,
1063
+ "simplify-concat": 4.199999966658652e-05,
1064
+ "simplify-while-loops": 1.9999999949504854e-06,
1065
+ "transform-variadic-reduce": 7.000000096013537e-06,
1066
+ "tuple-simplifier": 4.999999873689376e-06,
1067
+ "unpack-nested-aws-ntwsr": 3.000000106112566e-06,
1068
+ "unroll-while-loop": 0.0
1069
+ },
1070
+ "hilo": {
1071
+ "ArithmeticIntensity": 105.0946273803711,
1072
+ "HloMacCount": 6509559808.0,
1073
+ "Traffic": 123879968.0
1074
+ }
1075
+ },
1076
+ "sg02": {
1077
+ "compiletime": {
1078
+ "CanonicalizeConv": 0.0,
1079
+ "CanonicalizeForTensorizer": 1.2999999853491317e-05,
1080
+ "Canonicalizer": 0.0002699999895412475,
1081
+ "HoistCompute": 4.999999873689376e-06,
1082
+ "IdentifyCrossPassTensors": 1.2000000424450263e-05,
1083
+ "MemcastMotion": 3.000000106112566e-06,
1084
+ "PenguinizeFunctions": 1.2000000424450263e-05,
1085
+ "PruneFunctions": 1.700000029813964e-05,
1086
+ "RemoveOptimizationBarriers": 7.000000096013537e-06,
1087
+ "ScatterMotion": 0.0,
1088
+ "TensorizerLegalizationPass": 7.999999979801942e-06,
1089
+ "VerifySupportedOps": 1.2000000424450263e-05,
1090
+ "algsimp": 4.8000001697801054e-05,
1091
+ "batchnorm_expander": 1.2999999853491317e-05,
1092
+ "boundary-marker-removal": 3.999999989900971e-06,
1093
+ "call-inliner": 9.000000318337698e-06,
1094
+ "canonicalize-boundary-marker": 4.999999873689376e-06,
1095
+ "collective-stream-id-checker": 3.000000106112566e-06,
1096
+ "comparison-expander": 2.4000000848900527e-05,
1097
+ "computation-deduplicator": 2.300000051036477e-05,
1098
+ "config-lowering": 3.400000059627928e-05,
1099
+ "constant_folding": 7.000000096013537e-06,
1100
+ "cse": 1.2999999853491317e-05,
1101
+ "dce": 9.999999974752427e-07,
1102
+ "dynamic-slice-transpose": 4.999999873689376e-06,
1103
+ "eliminate-redundant-compare": 3.000000106112566e-06,
1104
+ "emit-offloaded-dropout": 1.2000000424450263e-05,
1105
+ "flatten-call-graph": 1.1000000085914508e-05,
1106
+ "fuse-send-recv": 1.8999999156221747e-05,
1107
+ "hilo-conditional-to-select": 4.999999873689376e-06,
1108
+ "hilo::LegalizeAlias": 1.9999999949504854e-06,
1109
+ "hilo::NeuronInstCombine": 6.399999983841553e-05,
1110
+ "hilo::NeuronOpFusion": 9.999999747378752e-06,
1111
+ "hilo::ReplaceTokenTypeWithU8Pass": 1.8999999156221747e-05,
1112
+ "hilo::ScheduleFusion": 1.9999999949504854e-06,
1113
+ "hilo::SixtyFourHack": 4.400000034365803e-05,
1114
+ "hilo::VerifyAliasing": 9.999999974752427e-07,
1115
+ "hlo-mac-count": 0.004759000148624182,
1116
+ "legalize-ccops-for-tensorizer": 9.999999974752427e-07,
1117
+ "legalize-compare": 3.000000106112566e-06,
1118
+ "lower-argminmax-custom-call": 3.000000106112566e-06,
1119
+ "map-inline": 1.2999999853491317e-05,
1120
+ "metadata-naming": 1.5999999959603883e-05,
1121
+ "mlir::detail::OpToOpPassAdaptor": 2.4000000848900527e-05,
1122
+ "mlir::hlo::MhloToPyPenguin": 0.005001000128686428,
1123
+ "mlir::mhlo::LowerComplexExtraPass": 7.79999973019585e-05,
1124
+ "mlir::mhlo::LowerComplexPass": 1.1000000085914508e-05,
1125
+ "native-to-custom-softmax": 4.999999873689376e-06,
1126
+ "native-to-custom-softmax-dx": 1.2999999853491317e-05,
1127
+ "neuron-hlo-verifier": 0.0003600000054575503,
1128
+ "operand_upcaster": 1.2000000424450263e-05,
1129
+ "post-par-pipe-begin": 9.999999974752427e-07,
1130
+ "post-par-pipe-end": 0.0,
1131
+ "post-partition-simplification": 0.00044999999227002263,
1132
+ "replace-minimum-constant": 7.000000096013537e-06,
1133
+ "reshape-mover": 3.000000106112566e-06,
1134
+ "simplify-concat": 3.199999991920777e-05,
1135
+ "simplify-while-loops": 1.9999999949504854e-06,
1136
+ "transform-variadic-reduce": 4.8000001697801054e-05,
1137
+ "tuple-simplifier": 3.999999989900971e-06,
1138
+ "unpack-nested-aws-ntwsr": 4.999999873689376e-06,
1139
+ "unroll-while-loop": 0.0
1140
+ },
1141
+ "hilo": {
1142
+ "ArithmeticIntensity": 28.312292098999023,
1143
+ "HloMacCount": 4988469248.0,
1144
+ "Traffic": 352388928.0
1145
+ }
1146
+ },
1147
+ "topk": {
1148
+ "compiletime": {
1149
+ "CoalesceCCOp": 0.006628990173339844,
1150
+ "DMALocalityOpt": 0.003807544708251953,
1151
+ "DMAProfiler": 0.007816553115844727,
1152
+ "DataStreaming": 0.022742509841918945,
1153
+ "DoNothing": 0.00023865699768066406,
1154
+ "ExpandISAMacro": 0.0065212249755859375,
1155
+ "FactorizeBlkDims": 0.026747465133666992,
1156
+ "InferPSumTensor": 0.02189779281616211,
1157
+ "InferSharedMemLoc": 0.0063364505767822266,
1158
+ "InsertCoreBarrier": 0.006017923355102539,
1159
+ "LateLegalizeInst": 0.014620304107666016,
1160
+ "LateNeuronInstComb": 0.0202789306640625,
1161
+ "LegalizeSundaAccess": 0.028186798095703125,
1162
+ "LegalizeType": 0.019533634185791016,
1163
+ "LowerBroadcast": 0.013374805450439453,
1164
+ "LowerIntrinsics": 0.010933876037597656,
1165
+ "LowerTranspose": 0.00744938850402832,
1166
+ "NeuronInstComb": 0.02654409408569336,
1167
+ "NeuronLICM": 0.021889686584472656,
1168
+ "NeuronSimplifyPredicates": 0.006708860397338867,
1169
+ "NeuronValueNumbering": 0.007520914077758789,
1170
+ "SFKVectorizer": 0.06228280067443848,
1171
+ "SimpleAllReduceTiling": 0.00653386116027832,
1172
+ "SimplifyNeuronTensor": 0.09801602363586426,
1173
+ "SpillPSum": 0.04999828338623047,
1174
+ "WeightCoalescing": 0.0065784454345703125
1175
+ }
1176
+ }
1177
+ }
context_encoding_model/_tp0_bk1/graph.neff ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b96dea22dba97fdfefb2f26f7ad03c509af0a395c08e4bfb143ff14bd673c826
3
+ size 1229824
context_encoding_model/_tp0_bk1/log-neuron-cc.txt ADDED
The diff for this file is too large to render. See raw diff
 
context_encoding_model/_tp0_bk1/metaneff.pb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:456dc08330072407208f8e4a41b70cc9190b30d05dced01f768e2bbc43e5076d
3
+ size 2438380
context_encoding_model/_tp0_bk1/model.MODULE_2330bfb0632c950ddab1+62ecd68b.hlo_module.pb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4a7565239b86e91fc95d8ad2ceb0bdd0fa2489c90c536cf87cd40f007ac5d60
3
+ size 2525166
context_encoding_model/_tp0_bk1/model.MODULE_2330bfb0632c950ddab1+62ecd68b.neff ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b96dea22dba97fdfefb2f26f7ad03c509af0a395c08e4bfb143ff14bd673c826
3
+ size 1229824
context_encoding_model/_tp0_bk1/neuron_config.json ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation_autoset": false,
3
+ "_name_or_path": "/home/ubuntu/models/Qwen3-1.7B",
4
+ "add_cross_attention": false,
5
+ "architectures": [
6
+ "Qwen3ForCausalLM"
7
+ ],
8
+ "attention_bias": false,
9
+ "attention_dropout": 0.0,
10
+ "attribute_map": {},
11
+ "bad_words_ids": null,
12
+ "begin_suppress_tokens": null,
13
+ "bos_token_id": 151643,
14
+ "chunk_size_feed_forward": 0,
15
+ "cross_attention_hidden_size": null,
16
+ "decoder_start_token_id": null,
17
+ "diversity_penalty": 0.0,
18
+ "do_sample": false,
19
+ "early_stopping": false,
20
+ "encoder_no_repeat_ngram_size": 0,
21
+ "eos_token_id": 151645,
22
+ "exponential_decay_length_penalty": null,
23
+ "finetuning_task": null,
24
+ "forced_bos_token_id": null,
25
+ "forced_eos_token_id": null,
26
+ "fused_spec_config": null,
27
+ "head_dim": 128,
28
+ "hidden_act": "silu",
29
+ "hidden_size": 2048,
30
+ "id2label": {
31
+ "0": "LABEL_0",
32
+ "1": "LABEL_1"
33
+ },
34
+ "initializer_range": 0.02,
35
+ "intermediate_size": 6144,
36
+ "is_decoder": false,
37
+ "is_encoder_decoder": false,
38
+ "label2id": {
39
+ "LABEL_0": 0,
40
+ "LABEL_1": 1
41
+ },
42
+ "length_penalty": 1.0,
43
+ "max_length": 20,
44
+ "max_position_embeddings": 40960,
45
+ "max_window_layers": 28,
46
+ "metadata": null,
47
+ "min_length": 0,
48
+ "model_type": "qwen3",
49
+ "neuron_config": {
50
+ "activation_quantization_type": null,
51
+ "allow_input_truncation": false,
52
+ "apply_seq_ids_mask": false,
53
+ "async_mode": false,
54
+ "attention_dp_degree": 1,
55
+ "attention_dtype": null,
56
+ "attn_block_cte_nki_kernel_enabled": false,
57
+ "attn_block_tkg_nki_kernel_cache_update": false,
58
+ "attn_block_tkg_nki_kernel_cascaded_attention": false,
59
+ "attn_block_tkg_nki_kernel_enabled": false,
60
+ "attn_cls": {
61
+ "__module__": "neuronx_distributed_inference.models.qwen3.modeling_qwen3",
62
+ "__name__": "NeuronQwen3Attention"
63
+ },
64
+ "attn_kernel_enabled": null,
65
+ "attn_tkg_builtin_kernel_enabled": false,
66
+ "attn_tkg_nki_kernel_enabled": false,
67
+ "batch_size": 1,
68
+ "bucket_n_active_tokens": true,
69
+ "buckets": [
70
+ 256
71
+ ],
72
+ "cast_type": "config",
73
+ "cc_pipeline_tiling_factor": 2,
74
+ "chunked_prefill_config": null,
75
+ "context_encoding_buckets": [
76
+ 256
77
+ ],
78
+ "cp_degree": 1,
79
+ "ctx_batch_size": 1,
80
+ "disable_kv_cache_tiling": false,
81
+ "draft_model_modules_to_not_convert": null,
82
+ "enable_bucketing": true,
83
+ "enable_cte_modular_flow": false,
84
+ "enable_eagle_draft_input_norm": false,
85
+ "enable_eagle_speculation": false,
86
+ "enable_fused_speculation": false,
87
+ "enable_long_context_mode": false,
88
+ "enable_output_completion_notifications": false,
89
+ "enable_spill_reload_dge": false,
90
+ "enable_token_tree": false,
91
+ "ep_degree": 1,
92
+ "expert_mlp_nki_kernel_enabled": null,
93
+ "flash_decoding_enabled": false,
94
+ "fused_qkv": false,
95
+ "fused_rmsnorm_skip_gamma": false,
96
+ "is_block_kv_layout": null,
97
+ "is_chunked_prefill": false,
98
+ "is_continuous_batching": true,
99
+ "is_eagle_draft": false,
100
+ "is_medusa": false,
101
+ "is_prefill_stage": true,
102
+ "is_prefix_caching": false,
103
+ "k_cache_transposed": false,
104
+ "kv_cache_batch_size": 8,
105
+ "kv_cache_padding_size": 0,
106
+ "kv_cache_quant": false,
107
+ "kv_cache_tiling": false,
108
+ "layer_boundary_markers": false,
109
+ "lm_head_pad": true,
110
+ "lm_head_pad_alignment_size": 1,
111
+ "local_ranks_size": 2,
112
+ "logical_nc_config": 2,
113
+ "lora_config": null,
114
+ "max_batch_size": 8,
115
+ "max_context_length": 4096,
116
+ "max_length": 4096,
117
+ "max_new_tokens": null,
118
+ "medusa_speculation_length": 0,
119
+ "medusa_tree": null,
120
+ "mlp_kernel_enabled": false,
121
+ "mlp_kernel_fuse_residual_add": false,
122
+ "modules_to_not_convert": null,
123
+ "moe_fused_nki_kernel_enabled": null,
124
+ "n_active_tokens": 4096,
125
+ "n_positions": 4096,
126
+ "num_medusa_heads": 0,
127
+ "on_cpu": false,
128
+ "on_device_sampling_config": {
129
+ "deterministic": false,
130
+ "do_sample": false,
131
+ "dynamic": true,
132
+ "global_topk": 256,
133
+ "on_device_sampling_config": true,
134
+ "temperature": 1.0,
135
+ "top_k": 1,
136
+ "top_k_kernel_enabled": false,
137
+ "top_p": 1.0
138
+ },
139
+ "output_logits": false,
140
+ "overrides_torch_dtype": true,
141
+ "pa_block_size": 4096,
142
+ "pa_num_blocks": 8,
143
+ "padding_side": "right",
144
+ "pp_degree": 1,
145
+ "prefix_buckets": null,
146
+ "qk_layernorm": false,
147
+ "qkv_kernel_enabled": false,
148
+ "qkv_kernel_fuse_residual_add": false,
149
+ "qkv_kernel_nbsd_layout": false,
150
+ "quantization_dtype": "int8",
151
+ "quantization_type": "per_tensor_symmetric",
152
+ "quantize_clamp_bound": Infinity,
153
+ "quantized": false,
154
+ "quantized_checkpoints_path": null,
155
+ "quantized_mlp_kernel_enabled": false,
156
+ "rmsnorm_quantize_kernel_enabled": false,
157
+ "router_topk_nki_kernel_enabled": null,
158
+ "rpl_reduce_dtype": null,
159
+ "save_sharded_checkpoint": true,
160
+ "scratchpad_page_size": null,
161
+ "seq_len": 4096,
162
+ "seq_len_threshold_for_cc_tiling": 16384,
163
+ "sequence_parallel_enabled": false,
164
+ "shared_mlp_nki_kernel_enabled": null,
165
+ "skip_sharding": false,
166
+ "skip_warmup": false,
167
+ "spec_batch_size": 8,
168
+ "speculation_length": 0,
169
+ "start_rank_id": 0,
170
+ "strided_context_parallel_kernel_enabled": false,
171
+ "target": null,
172
+ "tensor_capture_config": null,
173
+ "tile_cc": false,
174
+ "tkg_batch_size": 8,
175
+ "token_generation_buckets": null,
176
+ "token_tree_config": null,
177
+ "torch_dtype": "bfloat16",
178
+ "tp_degree": 2,
179
+ "vocab_parallel": false,
180
+ "weight_gather_seq_len_threshold": 32768,
181
+ "weights_to_skip_layout_optimization": [],
182
+ "world_size": 2
183
+ },
184
+ "no_repeat_ngram_size": 0,
185
+ "num_attention_heads": 16,
186
+ "num_beam_groups": 1,
187
+ "num_beams": 1,
188
+ "num_cores_per_group": 1,
189
+ "num_hidden_layers": 28,
190
+ "num_key_value_heads": 8,
191
+ "num_return_sequences": 1,
192
+ "output_attentions": false,
193
+ "output_hidden_states": false,
194
+ "output_scores": false,
195
+ "pad_token_id": 0,
196
+ "prefix": null,
197
+ "problem_type": null,
198
+ "pruned_heads": {},
199
+ "remove_invalid_values": false,
200
+ "repetition_penalty": 1.0,
201
+ "return_dict": true,
202
+ "return_dict_in_generate": false,
203
+ "rms_norm_eps": 1e-06,
204
+ "rope_scaling": null,
205
+ "rope_theta": 1000000,
206
+ "sep_token_id": null,
207
+ "sliding_window": null,
208
+ "suppress_tokens": null,
209
+ "task_specific_params": null,
210
+ "temperature": 1.0,
211
+ "tf_legacy_loss": false,
212
+ "tie_encoder_decoder": false,
213
+ "tie_word_embeddings": true,
214
+ "tokenizer_class": null,
215
+ "top_k": 50,
216
+ "top_p": 1.0,
217
+ "torchscript": false,
218
+ "transformers_version": "4.51.0",
219
+ "typical_p": 1.0,
220
+ "use_bfloat16": false,
221
+ "use_cache": true,
222
+ "use_sliding_window": false,
223
+ "vocab_size": 151936
224
+ }
context_encoding_model/_tp0_bk2/command.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ neuronx-cc compile --framework=XLA model.MODULE_49bb42f69f5b159ae769+3467f95e.hlo_module.pb --output model.MODULE_49bb42f69f5b159ae769+3467f95e.neff --target=trn2 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ' --lnc=2 -O1 '--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true' --logfile=log-neuron-cc.txt --verbose=35
context_encoding_model/_tp0_bk2/compile_flags.MODULE_49bb42f69f5b159ae769+3467f95e.json ADDED
@@ -0,0 +1 @@
 
 
1
+ ["--target=trn2", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "--lnc=2", "-O1", "--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true", "--logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk2/log-neuron-cc.txt"]
context_encoding_model/_tp0_bk2/global_metric_store.json ADDED
@@ -0,0 +1,1177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Average": {
3
+ "tensorizer": {
4
+ "StaticProfiler::AverageFractalPeUtilization": 98.77135467529297,
5
+ "StaticProfiler::AveragePartitionUtilization": 94.32398223876953,
6
+ "StaticProfiler::AveragePeUtilization": 96.75625610351563,
7
+ "StaticProfiler::LocalizationEfficiency": 86.58112335205078,
8
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 89.48306274414063,
9
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
10
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0
11
+ }
12
+ },
13
+ "Count": {
14
+ "tensorizer": {
15
+ "StaticProfiler::AverageFractalPeUtilization": 1.0,
16
+ "StaticProfiler::AveragePartitionUtilization": 1.0,
17
+ "StaticProfiler::AveragePeUtilization": 1.0,
18
+ "StaticProfiler::LocalizationEfficiency": 1.0,
19
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1.0,
20
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 1.0,
21
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 1.0
22
+ }
23
+ },
24
+ "Sum": {
25
+ "compiletime": {
26
+ "AGOrderingAnalysisPass": 0.07081985473632813,
27
+ "AffinePredicateResolution": 0.001847982406616211,
28
+ "AliasDependencyElimination": 0.0017039775848388672,
29
+ "AliasDependencyInduction": 0.016176223754882813,
30
+ "AliasDependencyReset": 0.0533907413482666,
31
+ "BFComputeCutting": 0.002690553665161133,
32
+ "BirCodeGenLoop": 0.436786413192749,
33
+ "CCOpFusion": 0.05509161949157715,
34
+ "CanonicalizeConv": 2.099999983329326e-05,
35
+ "CanonicalizeDAGForPGTiling": 0.01196432113647461,
36
+ "CanonicalizeForTensorizer": 4.0000002627493814e-05,
37
+ "CanonicalizeIR": 0.002866029739379883,
38
+ "Canonicalizer": 0.000770999991800636,
39
+ "CoalesceCCOp": 0.02091670036315918,
40
+ "CommuteConcat": 0.0016961097717285156,
41
+ "DMALocalityOpt": 0.012746095657348633,
42
+ "DMAProfiler": 0.025209903717041016,
43
+ "DMATilingProfiler": 0.013326406478881836,
44
+ "DataLocalityOpt": 0.13399314880371094,
45
+ "DataStreaming": 0.02252793312072754,
46
+ "DeConcat": 0.003023386001586914,
47
+ "DeadCodeElimination": 0.006216287612915039,
48
+ "DeadStoreElimination": 0.01400136947631836,
49
+ "DelinearIndices": 0.014129638671875,
50
+ "Delinearization": 0.004580259323120117,
51
+ "DelinearizeSPMD": 0.02204442024230957,
52
+ "DoNothing": 0.0005753040313720703,
53
+ "DramToDramTranspose": 0.0199737548828125,
54
+ "DumpGraphAndMetadata": 0.037271738052368164,
55
+ "EliminateDivs": 0.0025110244750976563,
56
+ "ExpandBatchNorm": 0.002251148223876953,
57
+ "ExpandISAMacro": 0.012173652648925781,
58
+ "FactorizeBlkDims": 0.041153669357299805,
59
+ "FactorizeThreadAxesInFreeDims": 0.0031156539916992188,
60
+ "FlattenMacroLoop": 0.005499601364135742,
61
+ "GenericAccessSimplifier": 0.004717350006103516,
62
+ "HoistCompute": 6.999999641266186e-06,
63
+ "IdentifyCrossPassTensors": 4.70000013592653e-05,
64
+ "InferInitValue": 0.046659231185913086,
65
+ "InferIntrinsicOnCC": 0.039793968200683594,
66
+ "InferNeuronTensor": 0.03774452209472656,
67
+ "InferNonlocalTensors": 0.030941486358642578,
68
+ "InferPSumTensor": 0.12924981117248535,
69
+ "InferShardAxis": 0.504509449005127,
70
+ "InferSharedMemLoc": 0.03389143943786621,
71
+ "InlineNativeKernels": 0.00193023681640625,
72
+ "InsertCoreBarrier": 0.019978046417236328,
73
+ "InsertIOTransposes": 0.061508893966674805,
74
+ "InsertImplicitShardAxisBeforeISel": 0.01612401008605957,
75
+ "InsertLocalTransposes": 0.005467414855957031,
76
+ "InsertOffloadedTransposes": 0.025030136108398438,
77
+ "LICM": 0.010097026824951172,
78
+ "LateLegalizeInst": 0.033937692642211914,
79
+ "LateLegalizePostSplit": 0.020189762115478516,
80
+ "LateLowerReshapeOp": 0.0018696784973144531,
81
+ "LateLowerTensorOp": 0.0022716522216796875,
82
+ "LateNeuronInstComb": 0.060944557189941406,
83
+ "LayoutPreprocessing": 0.05716848373413086,
84
+ "LayoutPreprocessingAndAnalysis": 0.12559008598327637,
85
+ "LayoutRequirementAnalysis": 0.01263284683227539,
86
+ "LegalizeCCOpLayout": 0.003709077835083008,
87
+ "LegalizeOpLevelAlias": 0.0016541481018066406,
88
+ "LegalizePartitionReduce": 0.007805347442626953,
89
+ "LegalizeSundaAccess": 0.13506388664245605,
90
+ "LegalizeSundaMacro": 0.020558595657348633,
91
+ "LegalizeType": 0.04366302490234375,
92
+ "LocalLayoutOpt": 0.04371356964111328,
93
+ "LoopFusion": 0.03305792808532715,
94
+ "LoopSplitting": 0.0017974376678466797,
95
+ "LowerBroadcast": 0.015467643737792969,
96
+ "LowerCCOpBlockAxis": 0.013673782348632813,
97
+ "LowerComplexBroadcast": 0.005238771438598633,
98
+ "LowerIntrinsics": 0.059927940368652344,
99
+ "LowerShardAxis": 0.02148151397705078,
100
+ "LowerTensorOp": 0.011847496032714844,
101
+ "LowerToSendRecv": 0.03099536895751953,
102
+ "LowerTranspose": 0.026517152786254883,
103
+ "MacroGeneration": 0.11886835098266602,
104
+ "MaskPropagation": 0.01356053352355957,
105
+ "MemcastMotion": 1.799999881768599e-05,
106
+ "MemcpyElimination": 0.050164222717285156,
107
+ "MutateDataType": 0.0028362274169921875,
108
+ "NeuronAliasDependencyInduction": 0.0024106502532958984,
109
+ "NeuronAliasDependencyReset": 0.07959818840026855,
110
+ "NeuronInstComb": 0.05623912811279297,
111
+ "NeuronLICM": 0.06090664863586426,
112
+ "NeuronLoopFusion": 0.0700373649597168,
113
+ "NeuronLoopInterchange": 0.003496885299682617,
114
+ "NeuronSimplifier": 0.0175168514251709,
115
+ "NeuronSimplifyPredicates": 0.035622596740722656,
116
+ "NeuronValueNumbering": 0.02324056625366211,
117
+ "OptimizeAliasedCopyChain": 0.0008881092071533203,
118
+ "OptimizeNKIKernels": 4.497897148132324,
119
+ "PAGLayoutOpt": 0.11170005798339844,
120
+ "PComputeCutting": 0.02699899673461914,
121
+ "PGLayoutTilingPipeline": 1.7730352878570557,
122
+ "PGTiling": 0.4928562641143799,
123
+ "PadElimination": 0.0005004405975341797,
124
+ "ParAxesAnnotation": 0.08141517639160156,
125
+ "PartialLoopFusion": 0.05184769630432129,
126
+ "PartialSimdFusion": 0.019034385681152344,
127
+ "PenguinizeFunctions": 3.7000001611886546e-05,
128
+ "PerfectLoopNest": 0.005218982696533203,
129
+ "PruneFunctions": 3.7999998312443495e-05,
130
+ "RecognizeOpIdiom": 0.028120994567871094,
131
+ "Recompute": 0.0006320476531982422,
132
+ "RelaxPredicates": 0.012555122375488281,
133
+ "Rematerialization": 0.002846240997314453,
134
+ "RemoveOptimizationBarriers": 8.199999865610152e-05,
135
+ "RemoveShardedPartitionAxes": 0.028553009033203125,
136
+ "ReshapeWeights": 0.0013833045959472656,
137
+ "ResolveAccessConflict": 0.007452726364135742,
138
+ "ResolveComplicatePredicates": 0.002027273178100586,
139
+ "RewriteReplicationMatmul": 0.0019905567169189453,
140
+ "RewriteWeights": 0.005997419357299805,
141
+ "SFKVectorizer": 0.2772505283355713,
142
+ "ScatterMotion": 2.300000051036477e-05,
143
+ "ShardingPropagationAnalysis": 0.11750531196594238,
144
+ "SimpleAllReduceTiling": 0.02184891700744629,
145
+ "Simplifier": 0.01620769500732422,
146
+ "SimplifyMacroPredicates": 0.03200030326843262,
147
+ "SimplifyNeuronTensor": 0.09968447685241699,
148
+ "SimplifySlice": 0.002093076705932617,
149
+ "SimplifyTensor": 0.01188349723815918,
150
+ "SpillPSum": 0.06837248802185059,
151
+ "SplitAPUnionSets": 0.09830927848815918,
152
+ "SplitAccGrp": 0.003184795379638672,
153
+ "StaticProfiler": 0.024499177932739258,
154
+ "StaticTransposeLocalTensor": 0.013921499252319336,
155
+ "SundaISel": 0.12911200523376465,
156
+ "TCTransform": 0.01076197624206543,
157
+ "TensorInitialization": 0.015585660934448242,
158
+ "TensorOpSimplifier": 0.009182214736938477,
159
+ "TensorOpTransform": 0.02479076385498047,
160
+ "TensorizerLegalizationPass": 4.5000000682193786e-05,
161
+ "TileCCOps": 0.01529073715209961,
162
+ "TilingProfiler": 0.02448558807373047,
163
+ "TransformConvOp": 0.0032668113708496094,
164
+ "TritiumFusion": 0.07947993278503418,
165
+ "ValueNumbering": 0.008611917495727539,
166
+ "VectorizeDMA": 0.008882284164428711,
167
+ "VectorizeMatMult": 0.013601303100585938,
168
+ "VerifySupportedOps": 3.199999991920777e-05,
169
+ "WeightCoalescing": 0.014402627944946289,
170
+ "ZeroSizeTensorElimination": 0.00017452239990234375,
171
+ "algsimp": 0.001744000008329749,
172
+ "batchnorm_expander": 3.5000000934815034e-05,
173
+ "boundary-marker-removal": 1.1000000085914508e-05,
174
+ "call-inliner": 0.00022499999613501132,
175
+ "canonicalize-boundary-marker": 1.2999999853491317e-05,
176
+ "collective-stream-id-checker": 6.0999998822808266e-05,
177
+ "comparison-expander": 0.0004409999819472432,
178
+ "computation-deduplicator": 5.299999611452222e-05,
179
+ "config-lowering": 9.800000407267362e-05,
180
+ "constant-statistics": 0.0003980000037699938,
181
+ "constant_folding": 0.00015499998698942363,
182
+ "cse": 3.199999991920777e-05,
183
+ "dce": 4.099999932805076e-05,
184
+ "dot_decomposer": 0.0008870000019669533,
185
+ "dynamic-slice-transpose": 1.2000000424450263e-05,
186
+ "eliminate-redundant-compare": 0.0001379999885102734,
187
+ "emit-offloaded-dropout": 3.400000059627928e-05,
188
+ "flatten-call-graph": 0.0006670000148005784,
189
+ "fuse-send-recv": 5.299999611452222e-05,
190
+ "hilo-conditional-to-select": 1.2000000424450263e-05,
191
+ "hilo::LegalizeAlias": 1.1999999514955562e-05,
192
+ "hilo::NeuronInstCombine": 0.00015300000086426735,
193
+ "hilo::NeuronOpFusion": 2.9999999242136255e-05,
194
+ "hilo::ReplaceTokenTypeWithU8Pass": 3.099999958067201e-05,
195
+ "hilo::ScheduleFusion": 5.999999757477781e-06,
196
+ "hilo::SixtyFourHack": 6.500000017695129e-05,
197
+ "hilo::VerifyAliasing": 4.999999873689376e-06,
198
+ "hlo-mac-count": 0.01228100061416626,
199
+ "instruction-histogram": 0.0007319999858736992,
200
+ "io-con-pipe-begin": 4.999999873689376e-06,
201
+ "io-con-pipe-end": 9.999999974752427e-07,
202
+ "io-layout-normalization": 0.0008159999852068722,
203
+ "io-statistics": 3.899999865097925e-05,
204
+ "legalize-ccops-for-tensorizer": 3.000000106112566e-06,
205
+ "legalize-compare": 1.1000000085914508e-05,
206
+ "lower-argminmax-custom-call": 9.000000318337698e-06,
207
+ "map-inline": 0.0007249999907799065,
208
+ "metadata-naming": 4.400000034365803e-05,
209
+ "mlir::detail::OpToOpPassAdaptor": 6.800000119255856e-05,
210
+ "mlir::hlo::MhloToPyPenguin": 0.008609999902546406,
211
+ "mlir::mhlo::LowerComplexExtraPass": 0.000291000003926456,
212
+ "mlir::mhlo::LowerComplexPass": 0.0005230000242590904,
213
+ "native-to-custom-softmax": 0.0003209999995306134,
214
+ "native-to-custom-softmax-dx": 0.0004980000085197389,
215
+ "neuron-hlo-verifier": 0.010431000031530857,
216
+ "operand_upcaster": 4.400000034365803e-05,
217
+ "opt-barrier-removal": 0.0002589999930933118,
218
+ "post-par-pipe-begin": 5.999999757477781e-06,
219
+ "post-par-pipe-end": 0.0,
220
+ "post-partition-simplification": 0.0013230000622570515,
221
+ "pre-par-pipe-begin": 9.999999974752427e-07,
222
+ "pre-par-pipe-end": 0.0,
223
+ "pre-partition-simplification": 0.06850799918174744,
224
+ "replace-minimum-constant": 0.00036299999919719994,
225
+ "reshape-mover": 5.500000042957254e-05,
226
+ "simplify-concat": 0.00010000000474974513,
227
+ "simplify-while-loops": 5.0000002374872565e-05,
228
+ "transform-variadic-reduce": 5.8999998145736754e-05,
229
+ "tuple-simplifier": 0.00014600000577047467,
230
+ "unpack-nested-aws-ntwsr": 0.0002479999966453761,
231
+ "unroll-while-loop": 7.999999979801942e-06,
232
+ "zero_sized_hlo_elimination": 0.0007040000054985285
233
+ },
234
+ "hilo": {
235
+ "ConstantSize": 926335.0,
236
+ "HloInputCount": 371.0,
237
+ "HloMacCount": 26463305728.0,
238
+ "HloOutputCount": 57.0,
239
+ "IfmapSize": 3910916096.0,
240
+ "OfmapSize": 1879048192.0,
241
+ "OutputsReadFromCount": 0.0,
242
+ "PassthroughTensorsCount": 0.0,
243
+ "RedundantOutputCount": 0.0,
244
+ "Traffic": 886427776.0
245
+ },
246
+ "tensorizer": {
247
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 22051.0,
248
+ "StaticProfiler::AifUb": 173.52798461914063,
249
+ "StaticProfiler::ArithmeticIntensityTensorizer": 150.2424774169922,
250
+ "StaticProfiler::AverageDmaLength": 2589.193359375,
251
+ "StaticProfiler::DDRTransferBytes": 407886880.0,
252
+ "StaticProfiler::InternalTransferBytes": 327079712.0,
253
+ "StaticProfiler::LoadExpanded": 89436.0,
254
+ "StaticProfiler::StoreExpanded": 2154.0,
255
+ "StaticProfiler::TotalDMAExpanded": 91590.0,
256
+ "StaticProfiler::TotalDynamicInstancesCount": 26447.0,
257
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 25996.0,
258
+ "StaticProfiler::TotalLNCComm": 0.0,
259
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
260
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
261
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
262
+ "TilingProfiler::GenericInstructionsAfterTiling": 4.0,
263
+ "TilingProfiler::MatMultInstructionsAfterTiling": 11424.0,
264
+ "TilingProfiler::NumPfTransposes": 6.0,
265
+ "TilingProfiler::NumPfTransposesForIo": 1.0,
266
+ "TilingProfiler::NumPfTransposesForLocal": 1.0,
267
+ "TilingProfiler::NumPfTransposesForNonlocal": 4.0,
268
+ "TilingProfiler::PfTransposeInstructions": 10291.0,
269
+ "TilingProfiler::PfTransposeInstructionsForIo": 9504.0,
270
+ "TilingProfiler::PfTransposeInstructionsForLocal": 1.0,
271
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 786.0,
272
+ "TilingProfiler::ReduceInstructionsAfterTiling": 4.0,
273
+ "TilingProfiler::SimdInstructionsAfterTiling": 164.0,
274
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
275
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
276
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
277
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
278
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
279
+ "TransformConvOp::conv2d_column_packing": 0.0,
280
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
281
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
282
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
283
+ }
284
+ },
285
+ "all": {
286
+ "compiletime": {
287
+ "algsimp": 0.0016029999824240804,
288
+ "call-inliner": 0.00019999999494757503,
289
+ "collective-stream-id-checker": 5.2999999752501026e-05,
290
+ "comparison-expander": 0.00042699999175965786,
291
+ "constant-statistics": 0.0003980000037699938,
292
+ "constant_folding": 0.0001340000017080456,
293
+ "dce": 3.7999998312443495e-05,
294
+ "dot_decomposer": 0.0008870000019669533,
295
+ "eliminate-redundant-compare": 0.0001289999927394092,
296
+ "flatten-call-graph": 0.0006440000142902136,
297
+ "hlo-mac-count": 0.007197000086307526,
298
+ "instruction-histogram": 0.0007319999858736992,
299
+ "io-con-pipe-begin": 4.999999873689376e-06,
300
+ "io-con-pipe-end": 9.999999974752427e-07,
301
+ "io-layout-normalization": 0.0008159999852068722,
302
+ "io-statistics": 3.899999865097925e-05,
303
+ "map-inline": 0.0006960000027902424,
304
+ "native-to-custom-softmax": 0.00030499999411404133,
305
+ "native-to-custom-softmax-dx": 0.00039000000106170774,
306
+ "neuron-hlo-verifier": 0.009362000040709972,
307
+ "opt-barrier-removal": 0.0002589999930933118,
308
+ "pre-par-pipe-begin": 9.999999974752427e-07,
309
+ "pre-par-pipe-end": 0.0,
310
+ "pre-partition-simplification": 0.06850799918174744,
311
+ "replace-minimum-constant": 0.00034500000765547156,
312
+ "reshape-mover": 4.8999998398358e-05,
313
+ "simplify-while-loops": 4.400000034365803e-05,
314
+ "tuple-simplifier": 0.0001340000017080456,
315
+ "unpack-nested-aws-ntwsr": 0.00023799999326001853,
316
+ "unroll-while-loop": 7.999999979801942e-06,
317
+ "zero_sized_hlo_elimination": 0.0007040000054985285
318
+ }
319
+ },
320
+ "attention_isa_kernel": {
321
+ "compiletime": {
322
+ "CoalesceCCOp": 0.00021982192993164063,
323
+ "DMALocalityOpt": 0.00021767616271972656,
324
+ "DMAProfiler": 0.0002532005310058594,
325
+ "DataStreaming": 0.00019359588623046875,
326
+ "DoNothing": 0.00017213821411132813,
327
+ "ExpandISAMacro": 0.00021219253540039063,
328
+ "FactorizeBlkDims": 0.0016205310821533203,
329
+ "InferPSumTensor": 0.00067901611328125,
330
+ "InferSharedMemLoc": 0.0005524158477783203,
331
+ "InsertCoreBarrier": 0.00033855438232421875,
332
+ "LateLegalizeInst": 0.00021457672119140625,
333
+ "LateNeuronInstComb": 0.00042700767517089844,
334
+ "LegalizeSundaAccess": 0.00022602081298828125,
335
+ "LegalizeType": 0.00026869773864746094,
336
+ "LowerBroadcast": 0.0002257823944091797,
337
+ "LowerIntrinsics": 0.0002770423889160156,
338
+ "LowerTranspose": 0.0002372264862060547,
339
+ "NeuronInstComb": 0.0004298686981201172,
340
+ "NeuronLICM": 0.00019097328186035156,
341
+ "NeuronSimplifyPredicates": 0.00029349327087402344,
342
+ "NeuronValueNumbering": 0.00023818016052246094,
343
+ "SFKVectorizer": 0.0022597312927246094,
344
+ "SimpleAllReduceTiling": 0.00019431114196777344,
345
+ "SimplifyNeuronTensor": 0.0004868507385253906,
346
+ "SpillPSum": 0.0006351470947265625,
347
+ "WeightCoalescing": 0.00022172927856445313
348
+ }
349
+ },
350
+ "cumsum": {
351
+ "compiletime": {
352
+ "CoalesceCCOp": 0.0003490447998046875,
353
+ "DMALocalityOpt": 0.00027871131896972656,
354
+ "DMAProfiler": 0.0013451576232910156,
355
+ "DataStreaming": 0.00047016143798828125,
356
+ "DoNothing": 0.0002353191375732422,
357
+ "ExpandISAMacro": 0.0008096694946289063,
358
+ "FactorizeBlkDims": 0.0007121562957763672,
359
+ "InferPSumTensor": 0.0026960372924804688,
360
+ "InferSharedMemLoc": 0.0007166862487792969,
361
+ "InsertCoreBarrier": 0.0004069805145263672,
362
+ "LateLegalizeInst": 0.0005886554718017578,
363
+ "LateNeuronInstComb": 0.002978801727294922,
364
+ "LegalizeSundaAccess": 0.003289461135864258,
365
+ "LegalizeType": 0.00041961669921875,
366
+ "LowerBroadcast": 0.0004119873046875,
367
+ "LowerIntrinsics": 0.0003657341003417969,
368
+ "LowerTranspose": 0.0004086494445800781,
369
+ "NeuronInstComb": 0.0012252330780029297,
370
+ "NeuronLICM": 0.0016541481018066406,
371
+ "NeuronSimplifyPredicates": 0.003880739212036133,
372
+ "NeuronValueNumbering": 0.0015976428985595703,
373
+ "SFKVectorizer": 0.005974292755126953,
374
+ "SimpleAllReduceTiling": 0.0007178783416748047,
375
+ "SimplifyNeuronTensor": 0.001119852066040039,
376
+ "SpillPSum": 0.003050565719604492,
377
+ "WeightCoalescing": 0.004181385040283203
378
+ }
379
+ },
380
+ "sg00": {
381
+ "compiletime": {
382
+ "CanonicalizeConv": 1.1000000085914508e-05,
383
+ "CanonicalizeForTensorizer": 1.4000000192027073e-05,
384
+ "Canonicalizer": 0.00028899998869746923,
385
+ "HoistCompute": 1.9999999949504854e-06,
386
+ "IdentifyCrossPassTensors": 1.5999999959603883e-05,
387
+ "MemcastMotion": 9.999999747378752e-06,
388
+ "PenguinizeFunctions": 1.4000000192027073e-05,
389
+ "PruneFunctions": 1.4000000192027073e-05,
390
+ "RemoveOptimizationBarriers": 2.099999983329326e-05,
391
+ "ScatterMotion": 9.000000318337698e-06,
392
+ "TensorizerLegalizationPass": 2.2000000171829015e-05,
393
+ "VerifySupportedOps": 9.999999747378752e-06,
394
+ "algsimp": 4.8000001697801054e-05,
395
+ "batchnorm_expander": 1.2000000424450263e-05,
396
+ "boundary-marker-removal": 3.999999989900971e-06,
397
+ "call-inliner": 7.000000096013537e-06,
398
+ "canonicalize-boundary-marker": 3.999999989900971e-06,
399
+ "collective-stream-id-checker": 1.9999999949504854e-06,
400
+ "comparison-expander": 3.999999989900971e-06,
401
+ "computation-deduplicator": 1.4999999621068127e-05,
402
+ "config-lowering": 3.400000059627928e-05,
403
+ "constant_folding": 7.000000096013537e-06,
404
+ "cse": 9.999999747378752e-06,
405
+ "dce": 9.999999974752427e-07,
406
+ "dynamic-slice-transpose": 3.999999989900971e-06,
407
+ "eliminate-redundant-compare": 3.000000106112566e-06,
408
+ "emit-offloaded-dropout": 1.2000000424450263e-05,
409
+ "flatten-call-graph": 7.000000096013537e-06,
410
+ "fuse-send-recv": 1.700000029813964e-05,
411
+ "hilo-conditional-to-select": 3.000000106112566e-06,
412
+ "hilo::LegalizeAlias": 4.999999873689376e-06,
413
+ "hilo::NeuronInstCombine": 5.700000110664405e-05,
414
+ "hilo::NeuronOpFusion": 1.4000000192027073e-05,
415
+ "hilo::ReplaceTokenTypeWithU8Pass": 1.2000000424450263e-05,
416
+ "hilo::ScheduleFusion": 9.999999974752427e-07,
417
+ "hilo::SixtyFourHack": 1.2000000424450263e-05,
418
+ "hilo::VerifyAliasing": 1.9999999949504854e-06,
419
+ "hlo-mac-count": 8.499999967170879e-05,
420
+ "legalize-ccops-for-tensorizer": 9.999999974752427e-07,
421
+ "legalize-compare": 3.000000106112566e-06,
422
+ "lower-argminmax-custom-call": 3.000000106112566e-06,
423
+ "map-inline": 9.000000318337698e-06,
424
+ "metadata-naming": 1.2000000424450263e-05,
425
+ "mlir::detail::OpToOpPassAdaptor": 2.300000051036477e-05,
426
+ "mlir::hlo::MhloToPyPenguin": 0.0016840000171214342,
427
+ "mlir::mhlo::LowerComplexExtraPass": 7.699999696342275e-05,
428
+ "mlir::mhlo::LowerComplexPass": 0.0001720000000204891,
429
+ "native-to-custom-softmax": 4.999999873689376e-06,
430
+ "native-to-custom-softmax-dx": 7.200000254670158e-05,
431
+ "neuron-hlo-verifier": 0.000371000001905486,
432
+ "operand_upcaster": 1.4000000192027073e-05,
433
+ "post-par-pipe-begin": 9.999999974752427e-07,
434
+ "post-par-pipe-end": 0.0,
435
+ "post-partition-simplification": 0.00043399998685345054,
436
+ "replace-minimum-constant": 6.000000212225132e-06,
437
+ "reshape-mover": 1.9999999949504854e-06,
438
+ "simplify-concat": 3.300000025774352e-05,
439
+ "simplify-while-loops": 1.9999999949504854e-06,
440
+ "transform-variadic-reduce": 7.000000096013537e-06,
441
+ "tuple-simplifier": 3.999999989900971e-06,
442
+ "unpack-nested-aws-ntwsr": 3.000000106112566e-06,
443
+ "unroll-while-loop": 0.0
444
+ },
445
+ "hilo": {
446
+ "ArithmeticIntensity": 17.4229793548584,
447
+ "ConstantSize": 926335.0,
448
+ "HloInputCount": 371.0,
449
+ "HloMacCount": 3489660928.0,
450
+ "HloOutputCount": 57.0,
451
+ "IfmapSize": 3910916096.0,
452
+ "OfmapSize": 1879048192.0,
453
+ "OutputsReadFromCount": 0.0,
454
+ "PassthroughTensorsCount": 0.0,
455
+ "RedundantOutputCount": 0.0,
456
+ "Traffic": 400581408.0
457
+ }
458
+ },
459
+ "sg0000": {
460
+ "compiletime": {
461
+ "AGOrderingAnalysisPass": 0.07508444786071777,
462
+ "AffinePredicateResolution": 0.0010340213775634766,
463
+ "AliasDependencyElimination": 0.0002384185791015625,
464
+ "AliasDependencyInduction": 0.007371425628662109,
465
+ "AliasDependencyReset": 0.0582888126373291,
466
+ "BFComputeCutting": 0.013819217681884766,
467
+ "BirCodeGenLoop": 0.06449317932128906,
468
+ "CCOpFusion": 0.04928326606750488,
469
+ "CanonicalizeDAGForPGTiling": 0.0076160430908203125,
470
+ "CanonicalizeIR": 0.0027213096618652344,
471
+ "CoalesceCCOp": 0.007978439331054688,
472
+ "CommuteConcat": 0.002101421356201172,
473
+ "DMALocalityOpt": 0.005911350250244141,
474
+ "DMAProfiler": 0.011723995208740234,
475
+ "DMATilingProfiler": 0.0077321529388427734,
476
+ "DataLocalityOpt": 0.20074963569641113,
477
+ "DataStreaming": 0.012155294418334961,
478
+ "DeConcat": 0.00474858283996582,
479
+ "DeadCodeElimination": 0.002126932144165039,
480
+ "DeadStoreElimination": 0.044701576232910156,
481
+ "DelinearIndices": 0.019860267639160156,
482
+ "Delinearization": 0.006117343902587891,
483
+ "DelinearizeSPMD": 0.04185628890991211,
484
+ "DoNothing": 9.918212890625e-05,
485
+ "DramToDramTranspose": 0.017105817794799805,
486
+ "DumpGraphAndMetadata": 0.0168914794921875,
487
+ "EliminateDivs": 0.0026845932006835938,
488
+ "ExpandBatchNorm": 0.0020225048065185547,
489
+ "ExpandISAMacro": 0.007347822189331055,
490
+ "FactorizeBlkDims": 0.05445575714111328,
491
+ "FactorizeThreadAxesInFreeDims": 0.004782199859619141,
492
+ "FlattenMacroLoop": 0.012040138244628906,
493
+ "GenericAccessSimplifier": 0.001428365707397461,
494
+ "InferInitValue": 0.08275437355041504,
495
+ "InferIntrinsicOnCC": 0.016964197158813477,
496
+ "InferNeuronTensor": 0.0713052749633789,
497
+ "InferNonlocalTensors": 0.17369747161865234,
498
+ "InferPSumTensor": 0.07679295539855957,
499
+ "InferShardAxis": 0.5430936813354492,
500
+ "InferSharedMemLoc": 0.0051038265228271484,
501
+ "InlineNativeKernels": 0.005239963531494141,
502
+ "InsertCoreBarrier": 0.008324384689331055,
503
+ "InsertIOTransposes": 0.038658857345581055,
504
+ "InsertImplicitShardAxisBeforeISel": 0.009135007858276367,
505
+ "InsertLocalTransposes": 0.029627084732055664,
506
+ "InsertOffloadedTransposes": 0.019885540008544922,
507
+ "LICM": 0.0056383609771728516,
508
+ "LateLegalizeInst": 0.011803150177001953,
509
+ "LateLegalizePostSplit": 0.005868196487426758,
510
+ "LateLowerReshapeOp": 0.007382631301879883,
511
+ "LateLowerTensorOp": 0.004155397415161133,
512
+ "LateNeuronInstComb": 0.0334017276763916,
513
+ "LayoutPreprocessing": 0.25243687629699707,
514
+ "LayoutPreprocessingAndAnalysis": 0.30139756202697754,
515
+ "LayoutRequirementAnalysis": 0.014056921005249023,
516
+ "LegalizeCCOpLayout": 0.0020928382873535156,
517
+ "LegalizeOpLevelAlias": 0.0016238689422607422,
518
+ "LegalizePartitionReduce": 0.0030252933502197266,
519
+ "LegalizeSundaAccess": 0.05711483955383301,
520
+ "LegalizeSundaMacro": 0.023845911026000977,
521
+ "LegalizeType": 0.00843501091003418,
522
+ "LocalLayoutOpt": 0.11445784568786621,
523
+ "LoopFusion": 0.01024007797241211,
524
+ "LoopSplitting": 0.0017781257629394531,
525
+ "LowerBroadcast": 0.0037119388580322266,
526
+ "LowerCCOpBlockAxis": 0.014172077178955078,
527
+ "LowerComplexBroadcast": 0.004027366638183594,
528
+ "LowerIntrinsics": 0.03793048858642578,
529
+ "LowerShardAxis": 0.012651443481445313,
530
+ "LowerTensorOp": 0.01001119613647461,
531
+ "LowerToSendRecv": 0.005930900573730469,
532
+ "LowerTranspose": 0.018492937088012695,
533
+ "MacroGeneration": 0.11934685707092285,
534
+ "MaskPropagation": 0.005895137786865234,
535
+ "MemcpyElimination": 0.09257030487060547,
536
+ "MutateDataType": 0.0017631053924560547,
537
+ "NeuronAliasDependencyInduction": 0.0007777214050292969,
538
+ "NeuronAliasDependencyReset": 0.03222823143005371,
539
+ "NeuronInstComb": 0.02764892578125,
540
+ "NeuronLICM": 0.015506982803344727,
541
+ "NeuronLoopFusion": 0.0383763313293457,
542
+ "NeuronLoopInterchange": 0.010429620742797852,
543
+ "NeuronSimplifier": 0.033356666564941406,
544
+ "NeuronSimplifyPredicates": 0.006680965423583984,
545
+ "NeuronValueNumbering": 0.019241809844970703,
546
+ "OptimizeAliasedCopyChain": 0.0010235309600830078,
547
+ "OptimizeNKIKernels": 0.45916128158569336,
548
+ "PAGLayoutOpt": 0.7117609977722168,
549
+ "PComputeCutting": 0.020105838775634766,
550
+ "PGLayoutTilingPipeline": 2.928948163986206,
551
+ "PGTiling": 0.39027953147888184,
552
+ "PadElimination": 0.0007317066192626953,
553
+ "ParAxesAnnotation": 0.6492185592651367,
554
+ "PartialLoopFusion": 0.0445561408996582,
555
+ "PartialSimdFusion": 0.039563655853271484,
556
+ "PerfectLoopNest": 0.0034646987915039063,
557
+ "RecognizeOpIdiom": 0.016507387161254883,
558
+ "Recompute": 0.0003933906555175781,
559
+ "RelaxPredicates": 0.005345582962036133,
560
+ "Rematerialization": 0.005880117416381836,
561
+ "RemoveShardedPartitionAxes": 0.03753328323364258,
562
+ "ReshapeWeights": 0.002991914749145508,
563
+ "ResolveAccessConflict": 0.0245821475982666,
564
+ "ResolveComplicatePredicates": 0.0018818378448486328,
565
+ "RewriteReplicationMatmul": 0.0024051666259765625,
566
+ "RewriteWeights": 0.006072998046875,
567
+ "SFKVectorizer": 0.49936652183532715,
568
+ "ShardingPropagationAnalysis": 0.03256559371948242,
569
+ "SimpleAllReduceTiling": 0.0036296844482421875,
570
+ "Simplifier": 0.007125377655029297,
571
+ "SimplifyMacroPredicates": 0.02839207649230957,
572
+ "SimplifyNeuronTensor": 0.021625995635986328,
573
+ "SimplifySlice": 0.0024862289428710938,
574
+ "SimplifyTensor": 0.033231496810913086,
575
+ "SpillPSum": 0.034162282943725586,
576
+ "SplitAPUnionSets": 0.042994022369384766,
577
+ "SplitAccGrp": 0.00764918327331543,
578
+ "StaticProfiler": 0.008186817169189453,
579
+ "StaticTransposeLocalTensor": 0.007767438888549805,
580
+ "SundaISel": 0.05960273742675781,
581
+ "TCTransform": 0.00103759765625,
582
+ "TensorInitialization": 0.007684469223022461,
583
+ "TensorOpSimplifier": 0.006952047348022461,
584
+ "TensorOpTransform": 0.030390501022338867,
585
+ "TileCCOps": 0.006802797317504883,
586
+ "TilingProfiler": 0.040956735610961914,
587
+ "TransformConvOp": 0.0029840469360351563,
588
+ "TritiumFusion": 0.03676962852478027,
589
+ "ValueNumbering": 0.0034532546997070313,
590
+ "VectorizeDMA": 0.005709171295166016,
591
+ "VectorizeMatMult": 0.030527591705322266,
592
+ "WeightCoalescing": 0.0040700435638427734,
593
+ "ZeroSizeTensorElimination": 0.0002455711364746094
594
+ },
595
+ "tensorizer": {
596
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 1174.0,
597
+ "StaticProfiler::AifUb": 16.874553680419922,
598
+ "StaticProfiler::ArithmeticIntensityTensorizer": 204.6156768798828,
599
+ "StaticProfiler::AverageDmaLength": 1413.5869140625,
600
+ "StaticProfiler::AverageFractalPeUtilization": 99.77033233642578,
601
+ "StaticProfiler::AveragePartitionUtilization": 99.01372528076172,
602
+ "StaticProfiler::AveragePeUtilization": 99.29181671142578,
603
+ "StaticProfiler::DDRTransferBytes": 38148616.0,
604
+ "StaticProfiler::InternalTransferBytes": 22941696.0,
605
+ "StaticProfiler::LoadExpanded": 12553.0,
606
+ "StaticProfiler::LocalizationEfficiency": 1212.5694580078125,
607
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1809.3712158203125,
608
+ "StaticProfiler::StoreExpanded": 8193.0,
609
+ "StaticProfiler::TotalDMAExpanded": 20746.0,
610
+ "StaticProfiler::TotalDynamicInstancesCount": 1510.0,
611
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 1506.0,
612
+ "StaticProfiler::TotalLNCComm": 0.0,
613
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
614
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
615
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
616
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
617
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
618
+ "TilingProfiler::GenericInstructionsAfterTiling": 40.0,
619
+ "TilingProfiler::MatMultInstructionsAfterTiling": 644.0,
620
+ "TilingProfiler::NumPfTransposes": 7.0,
621
+ "TilingProfiler::NumPfTransposesForIo": 2.0,
622
+ "TilingProfiler::NumPfTransposesForLocal": 2.0,
623
+ "TilingProfiler::NumPfTransposesForNonlocal": 3.0,
624
+ "TilingProfiler::PfTransposeInstructions": 209.0,
625
+ "TilingProfiler::PfTransposeInstructionsForIo": 65.0,
626
+ "TilingProfiler::PfTransposeInstructionsForLocal": 48.0,
627
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 96.0,
628
+ "TilingProfiler::ReduceInstructionsAfterTiling": 0.0,
629
+ "TilingProfiler::SimdInstructionsAfterTiling": 136.0,
630
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
631
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
632
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
633
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
634
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
635
+ "TransformConvOp::conv2d_column_packing": 0.0,
636
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
637
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
638
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
639
+ }
640
+ },
641
+ "sg0001": {
642
+ "compiletime": {
643
+ "AGOrderingAnalysisPass": 0.08034706115722656,
644
+ "AffinePredicateResolution": 0.0021657943725585938,
645
+ "AliasDependencyElimination": 0.0002224445343017578,
646
+ "AliasDependencyInduction": 0.006604909896850586,
647
+ "AliasDependencyReset": 0.028621673583984375,
648
+ "BFComputeCutting": 0.006361484527587891,
649
+ "BirCodeGenLoop": 0.043970584869384766,
650
+ "CCOpFusion": 0.03917193412780762,
651
+ "CanonicalizeDAGForPGTiling": 0.015412569046020508,
652
+ "CanonicalizeIR": 0.0026285648345947266,
653
+ "CoalesceCCOp": 0.019171714782714844,
654
+ "CommuteConcat": 0.0022630691528320313,
655
+ "DMALocalityOpt": 0.0018835067749023438,
656
+ "DMAProfiler": 0.015621662139892578,
657
+ "DMATilingProfiler": 0.007387399673461914,
658
+ "DataLocalityOpt": 0.3166489601135254,
659
+ "DataStreaming": 0.008202552795410156,
660
+ "DeConcat": 0.0027625560760498047,
661
+ "DeadCodeElimination": 0.008514642715454102,
662
+ "DeadStoreElimination": 0.02995467185974121,
663
+ "DelinearIndices": 0.020328283309936523,
664
+ "Delinearization": 0.008889198303222656,
665
+ "DelinearizeSPMD": 0.025659799575805664,
666
+ "DoNothing": 9.298324584960938e-05,
667
+ "DramToDramTranspose": 0.013378381729125977,
668
+ "DumpGraphAndMetadata": 0.011143684387207031,
669
+ "EliminateDivs": 0.006491422653198242,
670
+ "ExpandBatchNorm": 0.0015842914581298828,
671
+ "ExpandISAMacro": 0.014866113662719727,
672
+ "FactorizeBlkDims": 0.02399158477783203,
673
+ "FactorizeThreadAxesInFreeDims": 0.008170843124389648,
674
+ "FlattenMacroLoop": 0.013584375381469727,
675
+ "GenericAccessSimplifier": 0.0016484260559082031,
676
+ "InferInitValue": 0.09902763366699219,
677
+ "InferIntrinsicOnCC": 0.05336475372314453,
678
+ "InferNeuronTensor": 0.0689244270324707,
679
+ "InferNonlocalTensors": 0.0623164176940918,
680
+ "InferPSumTensor": 0.06397223472595215,
681
+ "InferShardAxis": 0.7081491947174072,
682
+ "InferSharedMemLoc": 0.008078813552856445,
683
+ "InlineNativeKernels": 0.002736806869506836,
684
+ "InsertCoreBarrier": 0.008532524108886719,
685
+ "InsertIOTransposes": 0.04539895057678223,
686
+ "InsertImplicitShardAxisBeforeISel": 0.011088132858276367,
687
+ "InsertLocalTransposes": 0.008382081985473633,
688
+ "InsertOffloadedTransposes": 0.009244203567504883,
689
+ "LICM": 0.0059854984283447266,
690
+ "LateLegalizeInst": 0.012192487716674805,
691
+ "LateLegalizePostSplit": 0.004922151565551758,
692
+ "LateLowerReshapeOp": 0.0050048828125,
693
+ "LateLowerTensorOp": 0.00384521484375,
694
+ "LateNeuronInstComb": 0.01603221893310547,
695
+ "LayoutPreprocessing": 0.083892822265625,
696
+ "LayoutPreprocessingAndAnalysis": 0.14038705825805664,
697
+ "LayoutRequirementAnalysis": 0.026170969009399414,
698
+ "LegalizeCCOpLayout": 0.0018677711486816406,
699
+ "LegalizeOpLevelAlias": 0.0019845962524414063,
700
+ "LegalizePartitionReduce": 0.002770662307739258,
701
+ "LegalizeSundaAccess": 0.02824854850769043,
702
+ "LegalizeSundaMacro": 0.025277376174926758,
703
+ "LegalizeType": 0.005255222320556641,
704
+ "LocalLayoutOpt": 0.1487877368927002,
705
+ "LoopFusion": 0.009909629821777344,
706
+ "LoopSplitting": 0.004529237747192383,
707
+ "LowerBroadcast": 0.0027620792388916016,
708
+ "LowerCCOpBlockAxis": 0.012650728225708008,
709
+ "LowerComplexBroadcast": 0.015005111694335938,
710
+ "LowerIntrinsics": 0.03992509841918945,
711
+ "LowerShardAxis": 0.01078486442565918,
712
+ "LowerTensorOp": 0.010359048843383789,
713
+ "LowerToSendRecv": 0.010585546493530273,
714
+ "LowerTranspose": 0.024251461029052734,
715
+ "MacroGeneration": 0.17415404319763184,
716
+ "MaskPropagation": 0.009861946105957031,
717
+ "MemcpyElimination": 0.08973836898803711,
718
+ "MutateDataType": 0.0023250579833984375,
719
+ "NeuronAliasDependencyInduction": 0.0036211013793945313,
720
+ "NeuronAliasDependencyReset": 0.03322243690490723,
721
+ "NeuronInstComb": 0.027010679244995117,
722
+ "NeuronLICM": 0.014135122299194336,
723
+ "NeuronLoopFusion": 0.0790092945098877,
724
+ "NeuronLoopInterchange": 0.006104946136474609,
725
+ "NeuronSimplifier": 0.02999567985534668,
726
+ "NeuronSimplifyPredicates": 0.0038328170776367188,
727
+ "NeuronValueNumbering": 0.016868114471435547,
728
+ "OptimizeAliasedCopyChain": 0.0012192726135253906,
729
+ "OptimizeNKIKernels": 0.4351818561553955,
730
+ "PAGLayoutOpt": 0.3483104705810547,
731
+ "PComputeCutting": 0.02324676513671875,
732
+ "PGLayoutTilingPipeline": 2.0860910415649414,
733
+ "PGTiling": 0.4031491279602051,
734
+ "PadElimination": 0.000728607177734375,
735
+ "ParAxesAnnotation": 0.30509090423583984,
736
+ "PartialLoopFusion": 0.06583142280578613,
737
+ "PartialSimdFusion": 0.1207880973815918,
738
+ "PerfectLoopNest": 0.010277032852172852,
739
+ "RecognizeOpIdiom": 0.004372358322143555,
740
+ "Recompute": 0.00031304359436035156,
741
+ "RelaxPredicates": 0.005488395690917969,
742
+ "Rematerialization": 0.0020155906677246094,
743
+ "RemoveShardedPartitionAxes": 0.026065587997436523,
744
+ "ReshapeWeights": 0.0033690929412841797,
745
+ "ResolveAccessConflict": 0.011795282363891602,
746
+ "ResolveComplicatePredicates": 0.005822658538818359,
747
+ "RewriteReplicationMatmul": 0.004129886627197266,
748
+ "RewriteWeights": 0.012514114379882813,
749
+ "SFKVectorizer": 0.3114356994628906,
750
+ "ShardingPropagationAnalysis": 0.03329586982727051,
751
+ "SimpleAllReduceTiling": 0.003468751907348633,
752
+ "Simplifier": 0.007978200912475586,
753
+ "SimplifyMacroPredicates": 0.01414942741394043,
754
+ "SimplifyNeuronTensor": 0.018707275390625,
755
+ "SimplifySlice": 0.0030634403228759766,
756
+ "SimplifyTensor": 0.028036117553710938,
757
+ "SpillPSum": 0.02836132049560547,
758
+ "SplitAPUnionSets": 0.028769254684448242,
759
+ "SplitAccGrp": 0.002518892288208008,
760
+ "StaticProfiler": 0.012613058090209961,
761
+ "StaticTransposeLocalTensor": 0.014979124069213867,
762
+ "SundaISel": 0.06619906425476074,
763
+ "TCTransform": 0.0018546581268310547,
764
+ "TensorInitialization": 0.0047528743743896484,
765
+ "TensorOpSimplifier": 0.006958484649658203,
766
+ "TensorOpTransform": 0.0394289493560791,
767
+ "TileCCOps": 0.03006148338317871,
768
+ "TilingProfiler": 0.020921945571899414,
769
+ "TransformConvOp": 0.0030717849731445313,
770
+ "TritiumFusion": 0.10711383819580078,
771
+ "ValueNumbering": 0.002644777297973633,
772
+ "VectorizeDMA": 0.009524345397949219,
773
+ "VectorizeMatMult": 0.04689669609069824,
774
+ "WeightCoalescing": 0.004178285598754883,
775
+ "ZeroSizeTensorElimination": 0.00014138221740722656
776
+ },
777
+ "tensorizer": {
778
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 3307.0,
779
+ "StaticProfiler::AifUb": 142.25091552734375,
780
+ "StaticProfiler::ArithmeticIntensityTensorizer": 232.9062957763672,
781
+ "StaticProfiler::AverageDmaLength": 3958.823974609375,
782
+ "StaticProfiler::AverageFractalPeUtilization": 100.0,
783
+ "StaticProfiler::AveragePartitionUtilization": 99.65841674804688,
784
+ "StaticProfiler::AveragePeUtilization": 100.0,
785
+ "StaticProfiler::DDRTransferBytes": 118065160.0,
786
+ "StaticProfiler::InternalTransferBytes": 19660800.0,
787
+ "StaticProfiler::LoadExpanded": 17025.0,
788
+ "StaticProfiler::LocalizationEfficiency": 163.7292022705078,
789
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 185.10040283203125,
790
+ "StaticProfiler::StoreExpanded": 7937.0,
791
+ "StaticProfiler::TotalDMAExpanded": 24962.0,
792
+ "StaticProfiler::TotalDynamicInstancesCount": 3517.0,
793
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 3517.0,
794
+ "StaticProfiler::TotalLNCComm": 0.0,
795
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
796
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
797
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
798
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
799
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
800
+ "TilingProfiler::GenericInstructionsAfterTiling": 32.0,
801
+ "TilingProfiler::MatMultInstructionsAfterTiling": 2560.0,
802
+ "TilingProfiler::NumPfTransposes": 7.0,
803
+ "TilingProfiler::NumPfTransposesForIo": 3.0,
804
+ "TilingProfiler::NumPfTransposesForLocal": 2.0,
805
+ "TilingProfiler::NumPfTransposesForNonlocal": 2.0,
806
+ "TilingProfiler::PfTransposeInstructions": 232.0,
807
+ "TilingProfiler::PfTransposeInstructionsForIo": 72.0,
808
+ "TilingProfiler::PfTransposeInstructionsForLocal": 32.0,
809
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 128.0,
810
+ "TilingProfiler::ReduceInstructionsAfterTiling": 0.0,
811
+ "TilingProfiler::SimdInstructionsAfterTiling": 211.0,
812
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
813
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
814
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
815
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
816
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
817
+ "TransformConvOp::conv2d_column_packing": 0.0,
818
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
819
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
820
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
821
+ }
822
+ },
823
+ "sg0002": {
824
+ "compiletime": {
825
+ "AGOrderingAnalysisPass": 0.07081985473632813,
826
+ "AffinePredicateResolution": 0.001847982406616211,
827
+ "AliasDependencyElimination": 0.0017039775848388672,
828
+ "AliasDependencyInduction": 0.016176223754882813,
829
+ "AliasDependencyReset": 0.0533907413482666,
830
+ "BFComputeCutting": 0.002690553665161133,
831
+ "BirCodeGenLoop": 0.436786413192749,
832
+ "CCOpFusion": 0.05509161949157715,
833
+ "CanonicalizeDAGForPGTiling": 0.01196432113647461,
834
+ "CanonicalizeIR": 0.002866029739379883,
835
+ "CoalesceCCOp": 0.00784611701965332,
836
+ "CommuteConcat": 0.0016961097717285156,
837
+ "DMALocalityOpt": 0.006368398666381836,
838
+ "DMAProfiler": 0.016033411026000977,
839
+ "DMATilingProfiler": 0.013326406478881836,
840
+ "DataLocalityOpt": 0.13399314880371094,
841
+ "DataStreaming": 0.005326271057128906,
842
+ "DeConcat": 0.003023386001586914,
843
+ "DeadCodeElimination": 0.006216287612915039,
844
+ "DeadStoreElimination": 0.01400136947631836,
845
+ "DelinearIndices": 0.014129638671875,
846
+ "Delinearization": 0.004580259323120117,
847
+ "DelinearizeSPMD": 0.02204442024230957,
848
+ "DoNothing": 6.771087646484375e-05,
849
+ "DramToDramTranspose": 0.0199737548828125,
850
+ "DumpGraphAndMetadata": 0.037271738052368164,
851
+ "EliminateDivs": 0.0025110244750976563,
852
+ "ExpandBatchNorm": 0.002251148223876953,
853
+ "ExpandISAMacro": 0.0057184696197509766,
854
+ "FactorizeBlkDims": 0.020665884017944336,
855
+ "FactorizeThreadAxesInFreeDims": 0.0031156539916992188,
856
+ "FlattenMacroLoop": 0.005499601364135742,
857
+ "GenericAccessSimplifier": 0.004717350006103516,
858
+ "InferInitValue": 0.046659231185913086,
859
+ "InferIntrinsicOnCC": 0.039793968200683594,
860
+ "InferNeuronTensor": 0.03774452209472656,
861
+ "InferNonlocalTensors": 0.030941486358642578,
862
+ "InferPSumTensor": 0.10350608825683594,
863
+ "InferShardAxis": 0.504509449005127,
864
+ "InferSharedMemLoc": 0.021315813064575195,
865
+ "InlineNativeKernels": 0.00193023681640625,
866
+ "InsertCoreBarrier": 0.008482217788696289,
867
+ "InsertIOTransposes": 0.061508893966674805,
868
+ "InsertImplicitShardAxisBeforeISel": 0.01612401008605957,
869
+ "InsertLocalTransposes": 0.005467414855957031,
870
+ "InsertOffloadedTransposes": 0.025030136108398438,
871
+ "LICM": 0.010097026824951172,
872
+ "LateLegalizeInst": 0.010406017303466797,
873
+ "LateLegalizePostSplit": 0.020189762115478516,
874
+ "LateLowerReshapeOp": 0.0018696784973144531,
875
+ "LateLowerTensorOp": 0.0022716522216796875,
876
+ "LateNeuronInstComb": 0.022235631942749023,
877
+ "LayoutPreprocessing": 0.05716848373413086,
878
+ "LayoutPreprocessingAndAnalysis": 0.12559008598327637,
879
+ "LayoutRequirementAnalysis": 0.01263284683227539,
880
+ "LegalizeCCOpLayout": 0.003709077835083008,
881
+ "LegalizeOpLevelAlias": 0.0016541481018066406,
882
+ "LegalizePartitionReduce": 0.007805347442626953,
883
+ "LegalizeSundaAccess": 0.09120893478393555,
884
+ "LegalizeSundaMacro": 0.020558595657348633,
885
+ "LegalizeType": 0.006526947021484375,
886
+ "LocalLayoutOpt": 0.04371356964111328,
887
+ "LoopFusion": 0.03305792808532715,
888
+ "LoopSplitting": 0.0017974376678466797,
889
+ "LowerBroadcast": 0.005987882614135742,
890
+ "LowerCCOpBlockAxis": 0.013673782348632813,
891
+ "LowerComplexBroadcast": 0.005238771438598633,
892
+ "LowerIntrinsics": 0.04390692710876465,
893
+ "LowerShardAxis": 0.02148151397705078,
894
+ "LowerTensorOp": 0.011847496032714844,
895
+ "LowerToSendRecv": 0.03099536895751953,
896
+ "LowerTranspose": 0.022028207778930664,
897
+ "MacroGeneration": 0.11886835098266602,
898
+ "MaskPropagation": 0.01356053352355957,
899
+ "MemcpyElimination": 0.050164222717285156,
900
+ "MutateDataType": 0.0028362274169921875,
901
+ "NeuronAliasDependencyInduction": 0.0024106502532958984,
902
+ "NeuronAliasDependencyReset": 0.07959818840026855,
903
+ "NeuronInstComb": 0.024571895599365234,
904
+ "NeuronLICM": 0.019634723663330078,
905
+ "NeuronLoopFusion": 0.0700373649597168,
906
+ "NeuronLoopInterchange": 0.003496885299682617,
907
+ "NeuronSimplifier": 0.0175168514251709,
908
+ "NeuronSimplifyPredicates": 0.01945638656616211,
909
+ "NeuronValueNumbering": 0.014354467391967773,
910
+ "OptimizeAliasedCopyChain": 0.0008881092071533203,
911
+ "OptimizeNKIKernels": 4.497897148132324,
912
+ "PAGLayoutOpt": 0.11170005798339844,
913
+ "PComputeCutting": 0.02699899673461914,
914
+ "PGLayoutTilingPipeline": 1.7730352878570557,
915
+ "PGTiling": 0.4928562641143799,
916
+ "PadElimination": 0.0005004405975341797,
917
+ "ParAxesAnnotation": 0.08141517639160156,
918
+ "PartialLoopFusion": 0.05184769630432129,
919
+ "PartialSimdFusion": 0.019034385681152344,
920
+ "PerfectLoopNest": 0.005218982696533203,
921
+ "RecognizeOpIdiom": 0.028120994567871094,
922
+ "Recompute": 0.0006320476531982422,
923
+ "RelaxPredicates": 0.012555122375488281,
924
+ "Rematerialization": 0.002846240997314453,
925
+ "RemoveShardedPartitionAxes": 0.028553009033203125,
926
+ "ReshapeWeights": 0.0013833045959472656,
927
+ "ResolveAccessConflict": 0.007452726364135742,
928
+ "ResolveComplicatePredicates": 0.002027273178100586,
929
+ "RewriteReplicationMatmul": 0.0019905567169189453,
930
+ "RewriteWeights": 0.005997419357299805,
931
+ "SFKVectorizer": 0.20844674110412598,
932
+ "ShardingPropagationAnalysis": 0.11750531196594238,
933
+ "SimpleAllReduceTiling": 0.0042400360107421875,
934
+ "Simplifier": 0.01620769500732422,
935
+ "SimplifyMacroPredicates": 0.03200030326843262,
936
+ "SimplifyNeuronTensor": 0.016496896743774414,
937
+ "SimplifySlice": 0.002093076705932617,
938
+ "SimplifyTensor": 0.01188349723815918,
939
+ "SpillPSum": 0.019929170608520508,
940
+ "SplitAPUnionSets": 0.09830927848815918,
941
+ "SplitAccGrp": 0.003184795379638672,
942
+ "StaticProfiler": 0.024499177932739258,
943
+ "StaticTransposeLocalTensor": 0.013921499252319336,
944
+ "SundaISel": 0.12911200523376465,
945
+ "TCTransform": 0.01076197624206543,
946
+ "TensorInitialization": 0.015585660934448242,
947
+ "TensorOpSimplifier": 0.009182214736938477,
948
+ "TensorOpTransform": 0.02479076385498047,
949
+ "TileCCOps": 0.01529073715209961,
950
+ "TilingProfiler": 0.02448558807373047,
951
+ "TransformConvOp": 0.0032668113708496094,
952
+ "TritiumFusion": 0.07947993278503418,
953
+ "ValueNumbering": 0.008611917495727539,
954
+ "VectorizeDMA": 0.008882284164428711,
955
+ "VectorizeMatMult": 0.013601303100585938,
956
+ "WeightCoalescing": 0.0029730796813964844,
957
+ "ZeroSizeTensorElimination": 0.00017452239990234375
958
+ },
959
+ "tensorizer": {
960
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 22051.0,
961
+ "StaticProfiler::AifUb": 173.52798461914063,
962
+ "StaticProfiler::ArithmeticIntensityTensorizer": 150.2424774169922,
963
+ "StaticProfiler::AverageDmaLength": 2589.193359375,
964
+ "StaticProfiler::AverageFractalPeUtilization": 98.77135467529297,
965
+ "StaticProfiler::AveragePartitionUtilization": 94.32398223876953,
966
+ "StaticProfiler::AveragePeUtilization": 96.75625610351563,
967
+ "StaticProfiler::DDRTransferBytes": 407886880.0,
968
+ "StaticProfiler::InternalTransferBytes": 327079712.0,
969
+ "StaticProfiler::LoadExpanded": 89436.0,
970
+ "StaticProfiler::LocalizationEfficiency": 86.58112335205078,
971
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 89.48306274414063,
972
+ "StaticProfiler::StoreExpanded": 2154.0,
973
+ "StaticProfiler::TotalDMAExpanded": 91590.0,
974
+ "StaticProfiler::TotalDynamicInstancesCount": 26447.0,
975
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 25996.0,
976
+ "StaticProfiler::TotalLNCComm": 0.0,
977
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
978
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
979
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
980
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
981
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
982
+ "TilingProfiler::GenericInstructionsAfterTiling": 4.0,
983
+ "TilingProfiler::MatMultInstructionsAfterTiling": 11424.0,
984
+ "TilingProfiler::NumPfTransposes": 6.0,
985
+ "TilingProfiler::NumPfTransposesForIo": 1.0,
986
+ "TilingProfiler::NumPfTransposesForLocal": 1.0,
987
+ "TilingProfiler::NumPfTransposesForNonlocal": 4.0,
988
+ "TilingProfiler::PfTransposeInstructions": 10291.0,
989
+ "TilingProfiler::PfTransposeInstructionsForIo": 9504.0,
990
+ "TilingProfiler::PfTransposeInstructionsForLocal": 1.0,
991
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 786.0,
992
+ "TilingProfiler::ReduceInstructionsAfterTiling": 4.0,
993
+ "TilingProfiler::SimdInstructionsAfterTiling": 164.0,
994
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
995
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
996
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
997
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
998
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
999
+ "TransformConvOp::conv2d_column_packing": 0.0,
1000
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
1001
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
1002
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
1003
+ }
1004
+ },
1005
+ "sg01": {
1006
+ "compiletime": {
1007
+ "CanonicalizeConv": 9.999999974752427e-07,
1008
+ "CanonicalizeForTensorizer": 1.4000000192027073e-05,
1009
+ "Canonicalizer": 0.00020799999765586108,
1010
+ "HoistCompute": 4.999999873689376e-06,
1011
+ "IdentifyCrossPassTensors": 1.5999999959603883e-05,
1012
+ "MemcastMotion": 7.000000096013537e-06,
1013
+ "PenguinizeFunctions": 1.2000000424450263e-05,
1014
+ "PruneFunctions": 1.5999999959603883e-05,
1015
+ "RemoveOptimizationBarriers": 2.2000000171829015e-05,
1016
+ "ScatterMotion": 1.1000000085914508e-05,
1017
+ "TensorizerLegalizationPass": 1.700000029813964e-05,
1018
+ "VerifySupportedOps": 9.999999747378752e-06,
1019
+ "algsimp": 4.5000000682193786e-05,
1020
+ "batchnorm_expander": 1.2000000424450263e-05,
1021
+ "boundary-marker-removal": 3.999999989900971e-06,
1022
+ "call-inliner": 7.999999979801942e-06,
1023
+ "canonicalize-boundary-marker": 4.999999873689376e-06,
1024
+ "collective-stream-id-checker": 3.000000106112566e-06,
1025
+ "comparison-expander": 4.999999873689376e-06,
1026
+ "computation-deduplicator": 1.8999999156221747e-05,
1027
+ "config-lowering": 2.8000000384054147e-05,
1028
+ "constant_folding": 7.000000096013537e-06,
1029
+ "cse": 9.999999747378752e-06,
1030
+ "dce": 9.999999974752427e-07,
1031
+ "dynamic-slice-transpose": 3.999999989900971e-06,
1032
+ "eliminate-redundant-compare": 3.000000106112566e-06,
1033
+ "emit-offloaded-dropout": 1.1000000085914508e-05,
1034
+ "flatten-call-graph": 6.000000212225132e-06,
1035
+ "fuse-send-recv": 1.8999999156221747e-05,
1036
+ "hilo-conditional-to-select": 3.999999989900971e-06,
1037
+ "hilo::LegalizeAlias": 4.999999873689376e-06,
1038
+ "hilo::NeuronInstCombine": 5.199999941396527e-05,
1039
+ "hilo::NeuronOpFusion": 1.1000000085914508e-05,
1040
+ "hilo::ReplaceTokenTypeWithU8Pass": 6.000000212225132e-06,
1041
+ "hilo::ScheduleFusion": 9.999999974752427e-07,
1042
+ "hilo::SixtyFourHack": 1.2000000424450263e-05,
1043
+ "hilo::VerifyAliasing": 1.9999999949504854e-06,
1044
+ "hlo-mac-count": 9.699999645818025e-05,
1045
+ "legalize-ccops-for-tensorizer": 9.999999974752427e-07,
1046
+ "legalize-compare": 3.999999989900971e-06,
1047
+ "lower-argminmax-custom-call": 3.000000106112566e-06,
1048
+ "map-inline": 9.999999747378752e-06,
1049
+ "metadata-naming": 1.8000000636675395e-05,
1050
+ "mlir::detail::OpToOpPassAdaptor": 1.9999999494757503e-05,
1051
+ "mlir::hlo::MhloToPyPenguin": 0.001829999964684248,
1052
+ "mlir::mhlo::LowerComplexExtraPass": 0.00011999999696854502,
1053
+ "mlir::mhlo::LowerComplexPass": 0.0001849999971454963,
1054
+ "native-to-custom-softmax": 4.999999873689376e-06,
1055
+ "native-to-custom-softmax-dx": 1.2000000424450263e-05,
1056
+ "neuron-hlo-verifier": 0.00036700000055134296,
1057
+ "operand_upcaster": 1.4000000192027073e-05,
1058
+ "post-par-pipe-begin": 9.999999974752427e-07,
1059
+ "post-par-pipe-end": 0.0,
1060
+ "post-partition-simplification": 0.0004250000056345016,
1061
+ "replace-minimum-constant": 4.999999873689376e-06,
1062
+ "reshape-mover": 1.9999999949504854e-06,
1063
+ "simplify-concat": 3.5000000934815034e-05,
1064
+ "simplify-while-loops": 1.9999999949504854e-06,
1065
+ "transform-variadic-reduce": 7.000000096013537e-06,
1066
+ "tuple-simplifier": 3.999999989900971e-06,
1067
+ "unpack-nested-aws-ntwsr": 3.000000106112566e-06,
1068
+ "unroll-while-loop": 0.0
1069
+ },
1070
+ "hilo": {
1071
+ "ArithmeticIntensity": 201.87655639648438,
1072
+ "HloMacCount": 13153337344.0,
1073
+ "Traffic": 130310688.0
1074
+ }
1075
+ },
1076
+ "sg02": {
1077
+ "compiletime": {
1078
+ "CanonicalizeConv": 9.000000318337698e-06,
1079
+ "CanonicalizeForTensorizer": 1.2000000424450263e-05,
1080
+ "Canonicalizer": 0.0002739999908953905,
1081
+ "HoistCompute": 0.0,
1082
+ "IdentifyCrossPassTensors": 1.4999999621068127e-05,
1083
+ "MemcastMotion": 9.999999974752427e-07,
1084
+ "PenguinizeFunctions": 1.1000000085914508e-05,
1085
+ "PruneFunctions": 7.999999979801942e-06,
1086
+ "RemoveOptimizationBarriers": 3.899999865097925e-05,
1087
+ "ScatterMotion": 3.000000106112566e-06,
1088
+ "TensorizerLegalizationPass": 6.000000212225132e-06,
1089
+ "VerifySupportedOps": 1.2000000424450263e-05,
1090
+ "algsimp": 4.8000001697801054e-05,
1091
+ "batchnorm_expander": 1.1000000085914508e-05,
1092
+ "boundary-marker-removal": 3.000000106112566e-06,
1093
+ "call-inliner": 9.999999747378752e-06,
1094
+ "canonicalize-boundary-marker": 3.999999989900971e-06,
1095
+ "collective-stream-id-checker": 3.000000106112566e-06,
1096
+ "comparison-expander": 4.999999873689376e-06,
1097
+ "computation-deduplicator": 1.8999999156221747e-05,
1098
+ "config-lowering": 3.600000127335079e-05,
1099
+ "constant_folding": 7.000000096013537e-06,
1100
+ "cse": 1.2000000424450263e-05,
1101
+ "dce": 9.999999974752427e-07,
1102
+ "dynamic-slice-transpose": 3.999999989900971e-06,
1103
+ "eliminate-redundant-compare": 3.000000106112566e-06,
1104
+ "emit-offloaded-dropout": 1.1000000085914508e-05,
1105
+ "flatten-call-graph": 9.999999747378752e-06,
1106
+ "fuse-send-recv": 1.700000029813964e-05,
1107
+ "hilo-conditional-to-select": 4.999999873689376e-06,
1108
+ "hilo::LegalizeAlias": 1.9999999949504854e-06,
1109
+ "hilo::NeuronInstCombine": 4.400000034365803e-05,
1110
+ "hilo::NeuronOpFusion": 4.999999873689376e-06,
1111
+ "hilo::ReplaceTokenTypeWithU8Pass": 1.2999999853491317e-05,
1112
+ "hilo::ScheduleFusion": 3.999999989900971e-06,
1113
+ "hilo::SixtyFourHack": 4.099999932805076e-05,
1114
+ "hilo::VerifyAliasing": 9.999999974752427e-07,
1115
+ "hlo-mac-count": 0.004902000073343515,
1116
+ "legalize-ccops-for-tensorizer": 9.999999974752427e-07,
1117
+ "legalize-compare": 3.999999989900971e-06,
1118
+ "lower-argminmax-custom-call": 3.000000106112566e-06,
1119
+ "map-inline": 9.999999747378752e-06,
1120
+ "metadata-naming": 1.4000000192027073e-05,
1121
+ "mlir::detail::OpToOpPassAdaptor": 2.499999936844688e-05,
1122
+ "mlir::hlo::MhloToPyPenguin": 0.005096000153571367,
1123
+ "mlir::mhlo::LowerComplexExtraPass": 9.40000027185306e-05,
1124
+ "mlir::mhlo::LowerComplexPass": 0.00016599999798927456,
1125
+ "native-to-custom-softmax": 6.000000212225132e-06,
1126
+ "native-to-custom-softmax-dx": 2.4000000848900527e-05,
1127
+ "neuron-hlo-verifier": 0.00033099998836405575,
1128
+ "operand_upcaster": 1.5999999959603883e-05,
1129
+ "post-par-pipe-begin": 3.999999989900971e-06,
1130
+ "post-par-pipe-end": 0.0,
1131
+ "post-partition-simplification": 0.00046400001156143844,
1132
+ "replace-minimum-constant": 7.000000096013537e-06,
1133
+ "reshape-mover": 1.9999999949504854e-06,
1134
+ "simplify-concat": 3.199999991920777e-05,
1135
+ "simplify-while-loops": 1.9999999949504854e-06,
1136
+ "transform-variadic-reduce": 4.5000000682193786e-05,
1137
+ "tuple-simplifier": 3.999999989900971e-06,
1138
+ "unpack-nested-aws-ntwsr": 3.999999989900971e-06,
1139
+ "unroll-while-loop": 0.0
1140
+ },
1141
+ "hilo": {
1142
+ "ArithmeticIntensity": 55.24231719970703,
1143
+ "HloMacCount": 9820307456.0,
1144
+ "Traffic": 355535680.0
1145
+ }
1146
+ },
1147
+ "topk": {
1148
+ "compiletime": {
1149
+ "CoalesceCCOp": 0.012721538543701172,
1150
+ "DMALocalityOpt": 0.00609898567199707,
1151
+ "DMAProfiler": 0.007831335067749023,
1152
+ "DataStreaming": 0.01673150062561035,
1153
+ "DoNothing": 0.0002722740173339844,
1154
+ "ExpandISAMacro": 0.0056455135345458984,
1155
+ "FactorizeBlkDims": 0.0197756290435791,
1156
+ "InferPSumTensor": 0.023047685623168945,
1157
+ "InferSharedMemLoc": 0.011858940124511719,
1158
+ "InsertCoreBarrier": 0.011088848114013672,
1159
+ "LateLegalizeInst": 0.02294301986694336,
1160
+ "LateNeuronInstComb": 0.03573012351989746,
1161
+ "LegalizeSundaAccess": 0.04056549072265625,
1162
+ "LegalizeType": 0.036716461181640625,
1163
+ "LowerBroadcast": 0.009067773818969727,
1164
+ "LowerIntrinsics": 0.0156552791595459,
1165
+ "LowerTranspose": 0.004080295562744141,
1166
+ "NeuronInstComb": 0.030441999435424805,
1167
+ "NeuronLICM": 0.03961777687072754,
1168
+ "NeuronSimplifyPredicates": 0.012285470962524414,
1169
+ "NeuronValueNumbering": 0.007288455963134766,
1170
+ "SFKVectorizer": 0.06282949447631836,
1171
+ "SimpleAllReduceTiling": 0.016891002655029297,
1172
+ "SimplifyNeuronTensor": 0.08206772804260254,
1173
+ "SpillPSum": 0.045392751693725586,
1174
+ "WeightCoalescing": 0.0072481632232666016
1175
+ }
1176
+ }
1177
+ }
context_encoding_model/_tp0_bk2/graph.neff ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06ba2911f0e007b1f4ad7d888115d6589d3bf2b988bbc6b3bc84a1db0766bb48
3
+ size 1342464
context_encoding_model/_tp0_bk2/log-neuron-cc.txt ADDED
The diff for this file is too large to render. See raw diff
 
context_encoding_model/_tp0_bk2/metaneff.pb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e16c9f7e6763d8d2b02577a4b90bcb120069c7fe5bb1001520c159d08abf614c
3
+ size 2610412
context_encoding_model/_tp0_bk2/model.MODULE_49bb42f69f5b159ae769+3467f95e.hlo_module.pb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c22ff4f27dafd3772342a93352c9b5a2c076d1824cec83419ac3d1f8c07d4e2f
3
+ size 2697198
context_encoding_model/_tp0_bk2/model.MODULE_49bb42f69f5b159ae769+3467f95e.neff ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06ba2911f0e007b1f4ad7d888115d6589d3bf2b988bbc6b3bc84a1db0766bb48
3
+ size 1342464
context_encoding_model/_tp0_bk2/neuron_config.json ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation_autoset": false,
3
+ "_name_or_path": "/home/ubuntu/models/Qwen3-1.7B",
4
+ "add_cross_attention": false,
5
+ "architectures": [
6
+ "Qwen3ForCausalLM"
7
+ ],
8
+ "attention_bias": false,
9
+ "attention_dropout": 0.0,
10
+ "attribute_map": {},
11
+ "bad_words_ids": null,
12
+ "begin_suppress_tokens": null,
13
+ "bos_token_id": 151643,
14
+ "chunk_size_feed_forward": 0,
15
+ "cross_attention_hidden_size": null,
16
+ "decoder_start_token_id": null,
17
+ "diversity_penalty": 0.0,
18
+ "do_sample": false,
19
+ "early_stopping": false,
20
+ "encoder_no_repeat_ngram_size": 0,
21
+ "eos_token_id": 151645,
22
+ "exponential_decay_length_penalty": null,
23
+ "finetuning_task": null,
24
+ "forced_bos_token_id": null,
25
+ "forced_eos_token_id": null,
26
+ "fused_spec_config": null,
27
+ "head_dim": 128,
28
+ "hidden_act": "silu",
29
+ "hidden_size": 2048,
30
+ "id2label": {
31
+ "0": "LABEL_0",
32
+ "1": "LABEL_1"
33
+ },
34
+ "initializer_range": 0.02,
35
+ "intermediate_size": 6144,
36
+ "is_decoder": false,
37
+ "is_encoder_decoder": false,
38
+ "label2id": {
39
+ "LABEL_0": 0,
40
+ "LABEL_1": 1
41
+ },
42
+ "length_penalty": 1.0,
43
+ "max_length": 20,
44
+ "max_position_embeddings": 40960,
45
+ "max_window_layers": 28,
46
+ "metadata": null,
47
+ "min_length": 0,
48
+ "model_type": "qwen3",
49
+ "neuron_config": {
50
+ "activation_quantization_type": null,
51
+ "allow_input_truncation": false,
52
+ "apply_seq_ids_mask": false,
53
+ "async_mode": false,
54
+ "attention_dp_degree": 1,
55
+ "attention_dtype": null,
56
+ "attn_block_cte_nki_kernel_enabled": false,
57
+ "attn_block_tkg_nki_kernel_cache_update": false,
58
+ "attn_block_tkg_nki_kernel_cascaded_attention": false,
59
+ "attn_block_tkg_nki_kernel_enabled": false,
60
+ "attn_cls": {
61
+ "__module__": "neuronx_distributed_inference.models.qwen3.modeling_qwen3",
62
+ "__name__": "NeuronQwen3Attention"
63
+ },
64
+ "attn_kernel_enabled": null,
65
+ "attn_tkg_builtin_kernel_enabled": false,
66
+ "attn_tkg_nki_kernel_enabled": false,
67
+ "batch_size": 1,
68
+ "bucket_n_active_tokens": true,
69
+ "buckets": [
70
+ 512
71
+ ],
72
+ "cast_type": "config",
73
+ "cc_pipeline_tiling_factor": 2,
74
+ "chunked_prefill_config": null,
75
+ "context_encoding_buckets": [
76
+ 512
77
+ ],
78
+ "cp_degree": 1,
79
+ "ctx_batch_size": 1,
80
+ "disable_kv_cache_tiling": false,
81
+ "draft_model_modules_to_not_convert": null,
82
+ "enable_bucketing": true,
83
+ "enable_cte_modular_flow": false,
84
+ "enable_eagle_draft_input_norm": false,
85
+ "enable_eagle_speculation": false,
86
+ "enable_fused_speculation": false,
87
+ "enable_long_context_mode": false,
88
+ "enable_output_completion_notifications": false,
89
+ "enable_spill_reload_dge": false,
90
+ "enable_token_tree": false,
91
+ "ep_degree": 1,
92
+ "expert_mlp_nki_kernel_enabled": null,
93
+ "flash_decoding_enabled": false,
94
+ "fused_qkv": false,
95
+ "fused_rmsnorm_skip_gamma": false,
96
+ "is_block_kv_layout": null,
97
+ "is_chunked_prefill": false,
98
+ "is_continuous_batching": true,
99
+ "is_eagle_draft": false,
100
+ "is_medusa": false,
101
+ "is_prefill_stage": true,
102
+ "is_prefix_caching": false,
103
+ "k_cache_transposed": false,
104
+ "kv_cache_batch_size": 8,
105
+ "kv_cache_padding_size": 0,
106
+ "kv_cache_quant": false,
107
+ "kv_cache_tiling": false,
108
+ "layer_boundary_markers": false,
109
+ "lm_head_pad": true,
110
+ "lm_head_pad_alignment_size": 1,
111
+ "local_ranks_size": 2,
112
+ "logical_nc_config": 2,
113
+ "lora_config": null,
114
+ "max_batch_size": 8,
115
+ "max_context_length": 4096,
116
+ "max_length": 4096,
117
+ "max_new_tokens": null,
118
+ "medusa_speculation_length": 0,
119
+ "medusa_tree": null,
120
+ "mlp_kernel_enabled": false,
121
+ "mlp_kernel_fuse_residual_add": false,
122
+ "modules_to_not_convert": null,
123
+ "moe_fused_nki_kernel_enabled": null,
124
+ "n_active_tokens": 4096,
125
+ "n_positions": 4096,
126
+ "num_medusa_heads": 0,
127
+ "on_cpu": false,
128
+ "on_device_sampling_config": {
129
+ "deterministic": false,
130
+ "do_sample": false,
131
+ "dynamic": true,
132
+ "global_topk": 256,
133
+ "on_device_sampling_config": true,
134
+ "temperature": 1.0,
135
+ "top_k": 1,
136
+ "top_k_kernel_enabled": false,
137
+ "top_p": 1.0
138
+ },
139
+ "output_logits": false,
140
+ "overrides_torch_dtype": true,
141
+ "pa_block_size": 4096,
142
+ "pa_num_blocks": 8,
143
+ "padding_side": "right",
144
+ "pp_degree": 1,
145
+ "prefix_buckets": null,
146
+ "qk_layernorm": false,
147
+ "qkv_kernel_enabled": false,
148
+ "qkv_kernel_fuse_residual_add": false,
149
+ "qkv_kernel_nbsd_layout": false,
150
+ "quantization_dtype": "int8",
151
+ "quantization_type": "per_tensor_symmetric",
152
+ "quantize_clamp_bound": Infinity,
153
+ "quantized": false,
154
+ "quantized_checkpoints_path": null,
155
+ "quantized_mlp_kernel_enabled": false,
156
+ "rmsnorm_quantize_kernel_enabled": false,
157
+ "router_topk_nki_kernel_enabled": null,
158
+ "rpl_reduce_dtype": null,
159
+ "save_sharded_checkpoint": true,
160
+ "scratchpad_page_size": null,
161
+ "seq_len": 4096,
162
+ "seq_len_threshold_for_cc_tiling": 16384,
163
+ "sequence_parallel_enabled": false,
164
+ "shared_mlp_nki_kernel_enabled": null,
165
+ "skip_sharding": false,
166
+ "skip_warmup": false,
167
+ "spec_batch_size": 8,
168
+ "speculation_length": 0,
169
+ "start_rank_id": 0,
170
+ "strided_context_parallel_kernel_enabled": false,
171
+ "target": null,
172
+ "tensor_capture_config": null,
173
+ "tile_cc": false,
174
+ "tkg_batch_size": 8,
175
+ "token_generation_buckets": null,
176
+ "token_tree_config": null,
177
+ "torch_dtype": "bfloat16",
178
+ "tp_degree": 2,
179
+ "vocab_parallel": false,
180
+ "weight_gather_seq_len_threshold": 32768,
181
+ "weights_to_skip_layout_optimization": [],
182
+ "world_size": 2
183
+ },
184
+ "no_repeat_ngram_size": 0,
185
+ "num_attention_heads": 16,
186
+ "num_beam_groups": 1,
187
+ "num_beams": 1,
188
+ "num_cores_per_group": 1,
189
+ "num_hidden_layers": 28,
190
+ "num_key_value_heads": 8,
191
+ "num_return_sequences": 1,
192
+ "output_attentions": false,
193
+ "output_hidden_states": false,
194
+ "output_scores": false,
195
+ "pad_token_id": 0,
196
+ "prefix": null,
197
+ "problem_type": null,
198
+ "pruned_heads": {},
199
+ "remove_invalid_values": false,
200
+ "repetition_penalty": 1.0,
201
+ "return_dict": true,
202
+ "return_dict_in_generate": false,
203
+ "rms_norm_eps": 1e-06,
204
+ "rope_scaling": null,
205
+ "rope_theta": 1000000,
206
+ "sep_token_id": null,
207
+ "sliding_window": null,
208
+ "suppress_tokens": null,
209
+ "task_specific_params": null,
210
+ "temperature": 1.0,
211
+ "tf_legacy_loss": false,
212
+ "tie_encoder_decoder": false,
213
+ "tie_word_embeddings": true,
214
+ "tokenizer_class": null,
215
+ "top_k": 50,
216
+ "top_p": 1.0,
217
+ "torchscript": false,
218
+ "transformers_version": "4.51.0",
219
+ "typical_p": 1.0,
220
+ "use_bfloat16": false,
221
+ "use_cache": true,
222
+ "use_sliding_window": false,
223
+ "vocab_size": 151936
224
+ }
context_encoding_model/_tp0_bk3/command.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ neuronx-cc compile --framework=XLA model.MODULE_be035899334776123ed5+d208bdce.hlo_module.pb --output model.MODULE_be035899334776123ed5+d208bdce.neff --target=trn2 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ' --lnc=2 -O1 '--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true' --logfile=log-neuron-cc.txt --verbose=35
context_encoding_model/_tp0_bk3/compile_flags.MODULE_be035899334776123ed5+d208bdce.json ADDED
@@ -0,0 +1 @@
 
 
1
+ ["--target=trn2", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "--lnc=2", "-O1", "--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true", "--logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk3/log-neuron-cc.txt"]
context_encoding_model/_tp0_bk3/global_metric_store.json ADDED
@@ -0,0 +1,1177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Average": {
3
+ "tensorizer": {
4
+ "StaticProfiler::AverageFractalPeUtilization": 98.80319213867188,
5
+ "StaticProfiler::AveragePartitionUtilization": 94.51075744628906,
6
+ "StaticProfiler::AveragePeUtilization": 96.83863067626953,
7
+ "StaticProfiler::LocalizationEfficiency": 84.98564147949219,
8
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 89.59233093261719,
9
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
10
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0
11
+ }
12
+ },
13
+ "Count": {
14
+ "tensorizer": {
15
+ "StaticProfiler::AverageFractalPeUtilization": 1.0,
16
+ "StaticProfiler::AveragePartitionUtilization": 1.0,
17
+ "StaticProfiler::AveragePeUtilization": 1.0,
18
+ "StaticProfiler::LocalizationEfficiency": 1.0,
19
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1.0,
20
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 1.0,
21
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 1.0
22
+ }
23
+ },
24
+ "Sum": {
25
+ "compiletime": {
26
+ "AGOrderingAnalysisPass": 0.057534217834472656,
27
+ "AffinePredicateResolution": 0.0009605884552001953,
28
+ "AliasDependencyElimination": 0.00025153160095214844,
29
+ "AliasDependencyInduction": 0.006276607513427734,
30
+ "AliasDependencyReset": 0.027743816375732422,
31
+ "BFComputeCutting": 0.0031321048736572266,
32
+ "BirCodeGenLoop": 0.5169932842254639,
33
+ "CCOpFusion": 0.05496716499328613,
34
+ "CanonicalizeConv": 1.8000000636675395e-05,
35
+ "CanonicalizeDAGForPGTiling": 0.010706663131713867,
36
+ "CanonicalizeForTensorizer": 3.7000001611886546e-05,
37
+ "CanonicalizeIR": 0.00154876708984375,
38
+ "Canonicalizer": 0.0007949999999254942,
39
+ "CoalesceCCOp": 0.0278623104095459,
40
+ "CommuteConcat": 0.001708984375,
41
+ "DMALocalityOpt": 0.010039329528808594,
42
+ "DMAProfiler": 0.031324148178100586,
43
+ "DMATilingProfiler": 0.011522531509399414,
44
+ "DataLocalityOpt": 0.28015780448913574,
45
+ "DataStreaming": 0.031224727630615234,
46
+ "DeConcat": 0.002462148666381836,
47
+ "DeadCodeElimination": 0.0021996498107910156,
48
+ "DeadStoreElimination": 0.007483243942260742,
49
+ "DelinearIndices": 0.008810281753540039,
50
+ "Delinearization": 0.009731292724609375,
51
+ "DelinearizeSPMD": 0.04425859451293945,
52
+ "DoNothing": 0.006867170333862305,
53
+ "DramToDramTranspose": 0.012907743453979492,
54
+ "DumpGraphAndMetadata": 0.07597684860229492,
55
+ "EliminateDivs": 0.0021903514862060547,
56
+ "ExpandBatchNorm": 0.001527547836303711,
57
+ "ExpandISAMacro": 0.024112701416015625,
58
+ "FactorizeBlkDims": 0.05227327346801758,
59
+ "FactorizeThreadAxesInFreeDims": 0.003031015396118164,
60
+ "FlattenMacroLoop": 0.004990577697753906,
61
+ "GenericAccessSimplifier": 0.0007598400115966797,
62
+ "HoistCompute": 1.2000000424450263e-05,
63
+ "IdentifyCrossPassTensors": 5.0000002374872565e-05,
64
+ "InferInitValue": 0.10130023956298828,
65
+ "InferIntrinsicOnCC": 0.007919549942016602,
66
+ "InferNeuronTensor": 0.05837249755859375,
67
+ "InferNonlocalTensors": 0.05706453323364258,
68
+ "InferPSumTensor": 0.06946349143981934,
69
+ "InferShardAxis": 0.4604020118713379,
70
+ "InferSharedMemLoc": 0.05161857604980469,
71
+ "InlineNativeKernels": 0.006569623947143555,
72
+ "InsertCoreBarrier": 0.018887758255004883,
73
+ "InsertIOTransposes": 0.0684211254119873,
74
+ "InsertImplicitShardAxisBeforeISel": 0.01549673080444336,
75
+ "InsertLocalTransposes": 0.022176742553710938,
76
+ "InsertOffloadedTransposes": 0.0181121826171875,
77
+ "LICM": 0.007555484771728516,
78
+ "LateLegalizeInst": 0.0287015438079834,
79
+ "LateLegalizePostSplit": 0.01993083953857422,
80
+ "LateLowerReshapeOp": 0.0016782283782958984,
81
+ "LateLowerTensorOp": 0.0021178722381591797,
82
+ "LateNeuronInstComb": 0.05098986625671387,
83
+ "LayoutPreprocessing": 0.10170960426330566,
84
+ "LayoutPreprocessingAndAnalysis": 0.23344039916992188,
85
+ "LayoutRequirementAnalysis": 0.032952308654785156,
86
+ "LegalizeCCOpLayout": 0.002583742141723633,
87
+ "LegalizeOpLevelAlias": 0.002170562744140625,
88
+ "LegalizePartitionReduce": 0.0025551319122314453,
89
+ "LegalizeSundaAccess": 0.1115577220916748,
90
+ "LegalizeSundaMacro": 0.04086017608642578,
91
+ "LegalizeType": 0.033699750900268555,
92
+ "LocalLayoutOpt": 0.023218154907226563,
93
+ "LoopFusion": 0.005990266799926758,
94
+ "LoopSplitting": 0.0007989406585693359,
95
+ "LowerBroadcast": 0.011745214462280273,
96
+ "LowerCCOpBlockAxis": 0.007201671600341797,
97
+ "LowerComplexBroadcast": 0.00890207290649414,
98
+ "LowerIntrinsics": 0.10557985305786133,
99
+ "LowerShardAxis": 0.023633956909179688,
100
+ "LowerTensorOp": 0.03027796745300293,
101
+ "LowerToSendRecv": 0.027859210968017578,
102
+ "LowerTranspose": 0.028818368911743164,
103
+ "MacroGeneration": 0.12761783599853516,
104
+ "MaskPropagation": 0.01400303840637207,
105
+ "MemcastMotion": 2.7999998565064743e-05,
106
+ "MemcpyElimination": 0.03596854209899902,
107
+ "MutateDataType": 0.0020971298217773438,
108
+ "NeuronAliasDependencyInduction": 0.0019202232360839844,
109
+ "NeuronAliasDependencyReset": 0.027405738830566406,
110
+ "NeuronInstComb": 0.048494815826416016,
111
+ "NeuronLICM": 0.052613019943237305,
112
+ "NeuronLoopFusion": 0.06255030632019043,
113
+ "NeuronLoopInterchange": 0.002681255340576172,
114
+ "NeuronSimplifier": 0.01907205581665039,
115
+ "NeuronSimplifyPredicates": 0.04273796081542969,
116
+ "NeuronValueNumbering": 0.019763708114624023,
117
+ "OptimizeAliasedCopyChain": 0.0005273818969726563,
118
+ "OptimizeNKIKernels": 4.391921043395996,
119
+ "PAGLayoutOpt": 0.16190624237060547,
120
+ "PComputeCutting": 0.016373872756958008,
121
+ "PGLayoutTilingPipeline": 2.0541465282440186,
122
+ "PGTiling": 0.3632845878601074,
123
+ "PadElimination": 0.0006501674652099609,
124
+ "ParAxesAnnotation": 0.08851456642150879,
125
+ "PartialLoopFusion": 0.05034661293029785,
126
+ "PartialSimdFusion": 0.014182329177856445,
127
+ "PenguinizeFunctions": 3.899999865097925e-05,
128
+ "PerfectLoopNest": 0.0036270618438720703,
129
+ "PruneFunctions": 3.7999998312443495e-05,
130
+ "RecognizeOpIdiom": 0.007064342498779297,
131
+ "Recompute": 0.00046062469482421875,
132
+ "RelaxPredicates": 0.02269601821899414,
133
+ "Rematerialization": 0.0019779205322265625,
134
+ "RemoveOptimizationBarriers": 4.400000034365803e-05,
135
+ "RemoveShardedPartitionAxes": 0.014830350875854492,
136
+ "ReshapeWeights": 0.0021474361419677734,
137
+ "ResolveAccessConflict": 0.007428646087646484,
138
+ "ResolveComplicatePredicates": 0.001834869384765625,
139
+ "RewriteReplicationMatmul": 0.006201982498168945,
140
+ "RewriteWeights": 0.004793643951416016,
141
+ "SFKVectorizer": 0.41699957847595215,
142
+ "ScatterMotion": 3.80000019504223e-05,
143
+ "ShardingPropagationAnalysis": 0.2801475524902344,
144
+ "SimpleAllReduceTiling": 0.025059938430786133,
145
+ "Simplifier": 0.003251314163208008,
146
+ "SimplifyMacroPredicates": 0.03280019760131836,
147
+ "SimplifyNeuronTensor": 0.14811110496520996,
148
+ "SimplifySlice": 0.0008628368377685547,
149
+ "SimplifyTensor": 0.014911413192749023,
150
+ "SpillPSum": 0.0687708854675293,
151
+ "SplitAPUnionSets": 0.09714126586914063,
152
+ "SplitAccGrp": 0.006166219711303711,
153
+ "StaticProfiler": 0.021403789520263672,
154
+ "StaticTransposeLocalTensor": 0.02319931983947754,
155
+ "SundaISel": 0.07143282890319824,
156
+ "TCTransform": 0.001344442367553711,
157
+ "TensorInitialization": 0.020877599716186523,
158
+ "TensorOpSimplifier": 0.0060787200927734375,
159
+ "TensorOpTransform": 0.03784608840942383,
160
+ "TensorizerLegalizationPass": 5.0000002374872565e-05,
161
+ "TileCCOps": 0.005100250244140625,
162
+ "TilingProfiler": 0.02941441535949707,
163
+ "TransformConvOp": 0.005896091461181641,
164
+ "TritiumFusion": 0.08978962898254395,
165
+ "ValueNumbering": 0.0032432079315185547,
166
+ "VectorizeDMA": 0.005987644195556641,
167
+ "VectorizeMatMult": 0.019278526306152344,
168
+ "VerifySupportedOps": 3.600000127335079e-05,
169
+ "WeightCoalescing": 0.014359712600708008,
170
+ "ZeroSizeTensorElimination": 0.00021028518676757813,
171
+ "algsimp": 0.001816999982111156,
172
+ "batchnorm_expander": 3.5000000934815034e-05,
173
+ "boundary-marker-removal": 1.2999998943996616e-05,
174
+ "call-inliner": 0.00031099998159334064,
175
+ "canonicalize-boundary-marker": 1.5999999959603883e-05,
176
+ "collective-stream-id-checker": 7.60000039008446e-05,
177
+ "comparison-expander": 0.0004780000017490238,
178
+ "computation-deduplicator": 5.699999746866524e-05,
179
+ "config-lowering": 0.00012000000424450263,
180
+ "constant-statistics": 0.00038899999344721437,
181
+ "constant_folding": 0.00016199999663513154,
182
+ "cse": 3.5000000934815034e-05,
183
+ "dce": 4.3000000005122274e-05,
184
+ "dot_decomposer": 0.0010089999996125698,
185
+ "dynamic-slice-transpose": 1.2000000424450263e-05,
186
+ "eliminate-redundant-compare": 0.00013299999409355223,
187
+ "emit-offloaded-dropout": 3.7000001611886546e-05,
188
+ "flatten-call-graph": 0.0008110000053420663,
189
+ "fuse-send-recv": 6.600000051548705e-05,
190
+ "hilo-conditional-to-select": 1.2999999853491317e-05,
191
+ "hilo::LegalizeAlias": 1.1000000085914508e-05,
192
+ "hilo::NeuronInstCombine": 0.00019799999427050352,
193
+ "hilo::NeuronOpFusion": 3.7000001611886546e-05,
194
+ "hilo::ReplaceTokenTypeWithU8Pass": 4.8000001697801054e-05,
195
+ "hilo::ScheduleFusion": 3.999999989900971e-06,
196
+ "hilo::SixtyFourHack": 6.800000119255856e-05,
197
+ "hilo::VerifyAliasing": 3.999999989900971e-06,
198
+ "hlo-mac-count": 0.012529000639915466,
199
+ "instruction-histogram": 0.0008679999737069011,
200
+ "io-con-pipe-begin": 6.000000212225132e-06,
201
+ "io-con-pipe-end": 9.999999974752427e-07,
202
+ "io-layout-normalization": 0.0010789999505504966,
203
+ "io-statistics": 3.899999865097925e-05,
204
+ "legalize-ccops-for-tensorizer": 3.000000106112566e-06,
205
+ "legalize-compare": 1.1000000085914508e-05,
206
+ "lower-argminmax-custom-call": 9.999999747378752e-06,
207
+ "map-inline": 0.000813000020571053,
208
+ "metadata-naming": 4.900000203633681e-05,
209
+ "mlir::detail::OpToOpPassAdaptor": 7.60000039008446e-05,
210
+ "mlir::hlo::MhloToPyPenguin": 0.008621999993920326,
211
+ "mlir::mhlo::LowerComplexExtraPass": 0.00021299999207258224,
212
+ "mlir::mhlo::LowerComplexPass": 0.0003549999964889139,
213
+ "native-to-custom-softmax": 0.00033000000985339284,
214
+ "native-to-custom-softmax-dx": 0.0016530000139027834,
215
+ "neuron-hlo-verifier": 0.011901999823749065,
216
+ "operand_upcaster": 5.299999611452222e-05,
217
+ "opt-barrier-removal": 0.0003209999995306134,
218
+ "post-par-pipe-begin": 0.0003220000071451068,
219
+ "post-par-pipe-end": 0.0,
220
+ "post-partition-simplification": 0.0015040000434964895,
221
+ "pre-par-pipe-begin": 9.999999974752427e-07,
222
+ "pre-par-pipe-end": 0.0,
223
+ "pre-partition-simplification": 0.06566499918699265,
224
+ "replace-minimum-constant": 0.0003129999968223274,
225
+ "reshape-mover": 6.000000212225132e-05,
226
+ "simplify-concat": 0.00011900000390596688,
227
+ "simplify-while-loops": 5.900000178371556e-05,
228
+ "transform-variadic-reduce": 6.399999983841553e-05,
229
+ "tuple-simplifier": 0.00015100000018719584,
230
+ "unpack-nested-aws-ntwsr": 0.00023299999884329736,
231
+ "unroll-while-loop": 9.000000318337698e-06,
232
+ "zero_sized_hlo_elimination": 0.0007510000141337514
233
+ },
234
+ "hilo": {
235
+ "ConstantSize": 1843839.0,
236
+ "HloInputCount": 371.0,
237
+ "HloMacCount": 53843722240.0,
238
+ "HloOutputCount": 57.0,
239
+ "IfmapSize": 3910920192.0,
240
+ "OfmapSize": 1879048192.0,
241
+ "OutputsReadFromCount": 0.0,
242
+ "PassthroughTensorsCount": 0.0,
243
+ "RedundantOutputCount": 0.0,
244
+ "Traffic": 915302528.0
245
+ },
246
+ "tensorizer": {
247
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 22664.0,
248
+ "StaticProfiler::AifUb": 229.36119079589844,
249
+ "StaticProfiler::ArithmeticIntensityTensorizer": 194.92408752441406,
250
+ "StaticProfiler::AverageDmaLength": 2258.685546875,
251
+ "StaticProfiler::DDRTransferBytes": 420482080.0,
252
+ "StaticProfiler::InternalTransferBytes": 338614048.0,
253
+ "StaticProfiler::LoadExpanded": 118366.0,
254
+ "StaticProfiler::StoreExpanded": 4458.0,
255
+ "StaticProfiler::TotalDMAExpanded": 122824.0,
256
+ "StaticProfiler::TotalDynamicInstancesCount": 27423.0,
257
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 26972.0,
258
+ "StaticProfiler::TotalLNCComm": 0.0,
259
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
260
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
261
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
262
+ "TilingProfiler::GenericInstructionsAfterTiling": 4.0,
263
+ "TilingProfiler::MatMultInstructionsAfterTiling": 11808.0,
264
+ "TilingProfiler::NumPfTransposes": 5.0,
265
+ "TilingProfiler::NumPfTransposesForIo": 1.0,
266
+ "TilingProfiler::NumPfTransposesForLocal": 1.0,
267
+ "TilingProfiler::NumPfTransposesForNonlocal": 3.0,
268
+ "TilingProfiler::PfTransposeInstructions": 9889.0,
269
+ "TilingProfiler::PfTransposeInstructionsForIo": 9504.0,
270
+ "TilingProfiler::PfTransposeInstructionsForLocal": 1.0,
271
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 384.0,
272
+ "TilingProfiler::ReduceInstructionsAfterTiling": 6.0,
273
+ "TilingProfiler::SimdInstructionsAfterTiling": 165.0,
274
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
275
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
276
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
277
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
278
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
279
+ "TransformConvOp::conv2d_column_packing": 0.0,
280
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
281
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
282
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
283
+ }
284
+ },
285
+ "all": {
286
+ "compiletime": {
287
+ "algsimp": 0.0016659999964758754,
288
+ "call-inliner": 0.0002859999949578196,
289
+ "collective-stream-id-checker": 6.600000051548705e-05,
290
+ "comparison-expander": 0.00045900000259280205,
291
+ "constant-statistics": 0.00038899999344721437,
292
+ "constant_folding": 0.00014000000373926014,
293
+ "dce": 3.9999998989515007e-05,
294
+ "dot_decomposer": 0.0010089999996125698,
295
+ "eliminate-redundant-compare": 0.00012399999832268804,
296
+ "flatten-call-graph": 0.0007849999819882214,
297
+ "hlo-mac-count": 0.007579999975860119,
298
+ "instruction-histogram": 0.0008679999737069011,
299
+ "io-con-pipe-begin": 6.000000212225132e-06,
300
+ "io-con-pipe-end": 9.999999974752427e-07,
301
+ "io-layout-normalization": 0.0010789999505504966,
302
+ "io-statistics": 3.899999865097925e-05,
303
+ "map-inline": 0.0007789999945089221,
304
+ "native-to-custom-softmax": 0.000311999989207834,
305
+ "native-to-custom-softmax-dx": 0.00039400000241585076,
306
+ "neuron-hlo-verifier": 0.01071999967098236,
307
+ "opt-barrier-removal": 0.0003209999995306134,
308
+ "pre-par-pipe-begin": 9.999999974752427e-07,
309
+ "pre-par-pipe-end": 0.0,
310
+ "pre-partition-simplification": 0.06566499918699265,
311
+ "replace-minimum-constant": 0.00029399999766610563,
312
+ "reshape-mover": 5.199999941396527e-05,
313
+ "simplify-while-loops": 5.2999999752501026e-05,
314
+ "tuple-simplifier": 0.00013800000306218863,
315
+ "unpack-nested-aws-ntwsr": 0.0002209999947808683,
316
+ "unroll-while-loop": 9.000000318337698e-06,
317
+ "zero_sized_hlo_elimination": 0.0007510000141337514
318
+ }
319
+ },
320
+ "attention_isa_kernel": {
321
+ "compiletime": {
322
+ "CoalesceCCOp": 0.00023293495178222656,
323
+ "DMALocalityOpt": 0.0001811981201171875,
324
+ "DMAProfiler": 0.00021409988403320313,
325
+ "DataStreaming": 0.00021123886108398438,
326
+ "DoNothing": 0.00015926361083984375,
327
+ "ExpandISAMacro": 0.00025653839111328125,
328
+ "FactorizeBlkDims": 0.0004589557647705078,
329
+ "InferPSumTensor": 0.001004934310913086,
330
+ "InferSharedMemLoc": 0.0005850791931152344,
331
+ "InsertCoreBarrier": 0.00032901763916015625,
332
+ "LateLegalizeInst": 0.000202178955078125,
333
+ "LateNeuronInstComb": 0.000457763671875,
334
+ "LegalizeSundaAccess": 0.000244140625,
335
+ "LegalizeType": 0.00035119056701660156,
336
+ "LowerBroadcast": 0.0002529621124267578,
337
+ "LowerIntrinsics": 0.00025534629821777344,
338
+ "LowerTranspose": 0.00019860267639160156,
339
+ "NeuronInstComb": 0.0004410743713378906,
340
+ "NeuronLICM": 0.00022935867309570313,
341
+ "NeuronSimplifyPredicates": 0.00023698806762695313,
342
+ "NeuronValueNumbering": 0.00019621849060058594,
343
+ "SFKVectorizer": 0.0017054080963134766,
344
+ "SimpleAllReduceTiling": 0.00020575523376464844,
345
+ "SimplifyNeuronTensor": 0.00058746337890625,
346
+ "SpillPSum": 0.0008275508880615234,
347
+ "WeightCoalescing": 0.0002827644348144531
348
+ }
349
+ },
350
+ "cumsum": {
351
+ "compiletime": {
352
+ "CoalesceCCOp": 0.0004239082336425781,
353
+ "DMALocalityOpt": 0.0008606910705566406,
354
+ "DMAProfiler": 0.0012273788452148438,
355
+ "DataStreaming": 0.0004677772521972656,
356
+ "DoNothing": 0.0020771026611328125,
357
+ "ExpandISAMacro": 0.0009121894836425781,
358
+ "FactorizeBlkDims": 0.0007412433624267578,
359
+ "InferPSumTensor": 0.0011811256408691406,
360
+ "InferSharedMemLoc": 0.00045990943908691406,
361
+ "InsertCoreBarrier": 0.00042891502380371094,
362
+ "LateLegalizeInst": 0.00063323974609375,
363
+ "LateNeuronInstComb": 0.0013093948364257813,
364
+ "LegalizeSundaAccess": 0.0025353431701660156,
365
+ "LegalizeType": 0.001573801040649414,
366
+ "LowerBroadcast": 0.0004336833953857422,
367
+ "LowerIntrinsics": 0.0003495216369628906,
368
+ "LowerTranspose": 0.00044226646423339844,
369
+ "NeuronInstComb": 0.007911205291748047,
370
+ "NeuronLICM": 0.0006246566772460938,
371
+ "NeuronSimplifyPredicates": 0.006840705871582031,
372
+ "NeuronValueNumbering": 0.0007255077362060547,
373
+ "SFKVectorizer": 0.008939266204833984,
374
+ "SimpleAllReduceTiling": 0.0003476142883300781,
375
+ "SimplifyNeuronTensor": 0.0009677410125732422,
376
+ "SpillPSum": 0.0031452178955078125,
377
+ "WeightCoalescing": 0.000408172607421875
378
+ }
379
+ },
380
+ "sg00": {
381
+ "compiletime": {
382
+ "CanonicalizeConv": 0.0,
383
+ "CanonicalizeForTensorizer": 1.4000000192027073e-05,
384
+ "Canonicalizer": 0.0002680000034160912,
385
+ "HoistCompute": 3.000000106112566e-06,
386
+ "IdentifyCrossPassTensors": 1.4000000192027073e-05,
387
+ "MemcastMotion": 9.000000318337698e-06,
388
+ "PenguinizeFunctions": 1.4999999621068127e-05,
389
+ "PruneFunctions": 1.2999999853491317e-05,
390
+ "RemoveOptimizationBarriers": 7.000000096013537e-06,
391
+ "ScatterMotion": 1.8000000636675395e-05,
392
+ "TensorizerLegalizationPass": 2.700000004551839e-05,
393
+ "VerifySupportedOps": 1.4000000192027073e-05,
394
+ "algsimp": 4.8000001697801054e-05,
395
+ "batchnorm_expander": 1.2999999853491317e-05,
396
+ "boundary-marker-removal": 3.999999989900971e-06,
397
+ "call-inliner": 7.000000096013537e-06,
398
+ "canonicalize-boundary-marker": 4.999999873689376e-06,
399
+ "collective-stream-id-checker": 3.999999989900971e-06,
400
+ "comparison-expander": 7.000000096013537e-06,
401
+ "computation-deduplicator": 1.700000029813964e-05,
402
+ "config-lowering": 3.899999865097925e-05,
403
+ "constant_folding": 7.000000096013537e-06,
404
+ "cse": 1.1000000085914508e-05,
405
+ "dce": 9.999999974752427e-07,
406
+ "dynamic-slice-transpose": 3.999999989900971e-06,
407
+ "eliminate-redundant-compare": 3.000000106112566e-06,
408
+ "emit-offloaded-dropout": 1.2000000424450263e-05,
409
+ "flatten-call-graph": 7.999999979801942e-06,
410
+ "fuse-send-recv": 2.8000000384054147e-05,
411
+ "hilo-conditional-to-select": 3.999999989900971e-06,
412
+ "hilo::LegalizeAlias": 4.999999873689376e-06,
413
+ "hilo::NeuronInstCombine": 7.79999973019585e-05,
414
+ "hilo::NeuronOpFusion": 1.4000000192027073e-05,
415
+ "hilo::ReplaceTokenTypeWithU8Pass": 1.8999999156221747e-05,
416
+ "hilo::ScheduleFusion": 9.999999974752427e-07,
417
+ "hilo::SixtyFourHack": 1.1000000085914508e-05,
418
+ "hilo::VerifyAliasing": 1.9999999949504854e-06,
419
+ "hlo-mac-count": 9.999999747378752e-05,
420
+ "legalize-ccops-for-tensorizer": 9.999999974752427e-07,
421
+ "legalize-compare": 3.999999989900971e-06,
422
+ "lower-argminmax-custom-call": 3.000000106112566e-06,
423
+ "map-inline": 1.2000000424450263e-05,
424
+ "metadata-naming": 1.4000000192027073e-05,
425
+ "mlir::detail::OpToOpPassAdaptor": 2.099999983329326e-05,
426
+ "mlir::hlo::MhloToPyPenguin": 0.001617999980226159,
427
+ "mlir::mhlo::LowerComplexExtraPass": 6.70000008540228e-05,
428
+ "mlir::mhlo::LowerComplexPass": 0.00011800000356743112,
429
+ "native-to-custom-softmax": 7.000000096013537e-06,
430
+ "native-to-custom-softmax-dx": 0.001218999968841672,
431
+ "neuron-hlo-verifier": 0.0004619999963324517,
432
+ "operand_upcaster": 2.099999983329326e-05,
433
+ "post-par-pipe-begin": 0.00031800000579096377,
434
+ "post-par-pipe-end": 0.0,
435
+ "post-partition-simplification": 0.00047400000039488077,
436
+ "replace-minimum-constant": 6.000000212225132e-06,
437
+ "reshape-mover": 3.000000106112566e-06,
438
+ "simplify-concat": 3.7000001611886546e-05,
439
+ "simplify-while-loops": 1.9999999949504854e-06,
440
+ "transform-variadic-reduce": 7.999999979801942e-06,
441
+ "tuple-simplifier": 3.999999989900971e-06,
442
+ "unpack-nested-aws-ntwsr": 3.999999989900971e-06,
443
+ "unroll-while-loop": 0.0
444
+ },
445
+ "hilo": {
446
+ "ArithmeticIntensity": 36.6374397277832,
447
+ "ConstantSize": 1843839.0,
448
+ "HloInputCount": 371.0,
449
+ "HloMacCount": 7516192768.0,
450
+ "HloOutputCount": 57.0,
451
+ "IfmapSize": 3910920192.0,
452
+ "OfmapSize": 1879048192.0,
453
+ "OutputsReadFromCount": 0.0,
454
+ "PassthroughTensorsCount": 0.0,
455
+ "RedundantOutputCount": 0.0,
456
+ "Traffic": 410301216.0
457
+ }
458
+ },
459
+ "sg0000": {
460
+ "compiletime": {
461
+ "AGOrderingAnalysisPass": 0.10170578956604004,
462
+ "AffinePredicateResolution": 0.002114534378051758,
463
+ "AliasDependencyElimination": 0.0003135204315185547,
464
+ "AliasDependencyInduction": 0.008873462677001953,
465
+ "AliasDependencyReset": 0.08848953247070313,
466
+ "BFComputeCutting": 0.0046901702880859375,
467
+ "BirCodeGenLoop": 0.07164216041564941,
468
+ "CCOpFusion": 0.03796195983886719,
469
+ "CanonicalizeDAGForPGTiling": 0.004980564117431641,
470
+ "CanonicalizeIR": 0.0069043636322021484,
471
+ "CoalesceCCOp": 0.025182723999023438,
472
+ "CommuteConcat": 0.0019867420196533203,
473
+ "DMALocalityOpt": 0.0017561912536621094,
474
+ "DMAProfiler": 0.015140295028686523,
475
+ "DMATilingProfiler": 0.016626596450805664,
476
+ "DataLocalityOpt": 0.22760343551635742,
477
+ "DataStreaming": 0.010300159454345703,
478
+ "DeConcat": 0.0027208328247070313,
479
+ "DeadCodeElimination": 0.0024912357330322266,
480
+ "DeadStoreElimination": 0.0712437629699707,
481
+ "DelinearIndices": 0.016620635986328125,
482
+ "Delinearization": 0.009757280349731445,
483
+ "DelinearizeSPMD": 0.031106233596801758,
484
+ "DoNothing": 0.00010442733764648438,
485
+ "DramToDramTranspose": 0.015790462493896484,
486
+ "DumpGraphAndMetadata": 0.009348392486572266,
487
+ "EliminateDivs": 0.0055081844329833984,
488
+ "ExpandBatchNorm": 0.002715587615966797,
489
+ "ExpandISAMacro": 0.006904125213623047,
490
+ "FactorizeBlkDims": 0.02294635772705078,
491
+ "FactorizeThreadAxesInFreeDims": 0.004876136779785156,
492
+ "FlattenMacroLoop": 0.014545440673828125,
493
+ "GenericAccessSimplifier": 0.0014882087707519531,
494
+ "InferInitValue": 0.07265543937683105,
495
+ "InferIntrinsicOnCC": 0.016221046447753906,
496
+ "InferNeuronTensor": 0.06634330749511719,
497
+ "InferNonlocalTensors": 0.310718297958374,
498
+ "InferPSumTensor": 0.1104276180267334,
499
+ "InferShardAxis": 0.6379494667053223,
500
+ "InferSharedMemLoc": 0.007468461990356445,
501
+ "InlineNativeKernels": 0.008686304092407227,
502
+ "InsertCoreBarrier": 0.013060331344604492,
503
+ "InsertIOTransposes": 0.0500941276550293,
504
+ "InsertImplicitShardAxisBeforeISel": 0.013952255249023438,
505
+ "InsertLocalTransposes": 0.011726140975952148,
506
+ "InsertOffloadedTransposes": 0.015027046203613281,
507
+ "LICM": 0.009333610534667969,
508
+ "LateLegalizeInst": 0.02084517478942871,
509
+ "LateLegalizePostSplit": 0.006055116653442383,
510
+ "LateLowerReshapeOp": 0.0010623931884765625,
511
+ "LateLowerTensorOp": 0.005917787551879883,
512
+ "LateNeuronInstComb": 0.0374608039855957,
513
+ "LayoutPreprocessing": 0.11253118515014648,
514
+ "LayoutPreprocessingAndAnalysis": 0.17174959182739258,
515
+ "LayoutRequirementAnalysis": 0.01859116554260254,
516
+ "LegalizeCCOpLayout": 0.008987903594970703,
517
+ "LegalizeOpLevelAlias": 0.0018634796142578125,
518
+ "LegalizePartitionReduce": 0.0028128623962402344,
519
+ "LegalizeSundaAccess": 0.0760490894317627,
520
+ "LegalizeSundaMacro": 0.04249215126037598,
521
+ "LegalizeType": 0.017363786697387695,
522
+ "LocalLayoutOpt": 0.030303478240966797,
523
+ "LoopFusion": 0.015121221542358398,
524
+ "LoopSplitting": 0.001684427261352539,
525
+ "LowerBroadcast": 0.004286289215087891,
526
+ "LowerCCOpBlockAxis": 0.011670112609863281,
527
+ "LowerComplexBroadcast": 0.009485006332397461,
528
+ "LowerIntrinsics": 0.06814241409301758,
529
+ "LowerShardAxis": 0.01289224624633789,
530
+ "LowerTensorOp": 0.012324810028076172,
531
+ "LowerToSendRecv": 0.01944112777709961,
532
+ "LowerTranspose": 0.024444580078125,
533
+ "MacroGeneration": 0.12030863761901855,
534
+ "MaskPropagation": 0.0041234493255615234,
535
+ "MemcpyElimination": 0.11655545234680176,
536
+ "MutateDataType": 0.006365299224853516,
537
+ "NeuronAliasDependencyInduction": 0.0008358955383300781,
538
+ "NeuronAliasDependencyReset": 0.0208890438079834,
539
+ "NeuronInstComb": 0.012987852096557617,
540
+ "NeuronLICM": 0.03186321258544922,
541
+ "NeuronLoopFusion": 0.039856910705566406,
542
+ "NeuronLoopInterchange": 0.0034656524658203125,
543
+ "NeuronSimplifier": 0.04315042495727539,
544
+ "NeuronSimplifyPredicates": 0.005248546600341797,
545
+ "NeuronValueNumbering": 0.017512798309326172,
546
+ "OptimizeAliasedCopyChain": 0.0023038387298583984,
547
+ "OptimizeNKIKernels": 0.3315870761871338,
548
+ "PAGLayoutOpt": 0.6959309577941895,
549
+ "PComputeCutting": 0.02900981903076172,
550
+ "PGLayoutTilingPipeline": 2.8589253425598145,
551
+ "PGTiling": 0.4929697513580322,
552
+ "PadElimination": 0.0008306503295898438,
553
+ "ParAxesAnnotation": 0.6449503898620605,
554
+ "PartialLoopFusion": 0.04073286056518555,
555
+ "PartialSimdFusion": 0.04506206512451172,
556
+ "PerfectLoopNest": 0.003442049026489258,
557
+ "RecognizeOpIdiom": 0.01386570930480957,
558
+ "Recompute": 0.0005090236663818359,
559
+ "RelaxPredicates": 0.007751941680908203,
560
+ "Rematerialization": 0.0035130977630615234,
561
+ "RemoveShardedPartitionAxes": 0.042932987213134766,
562
+ "ReshapeWeights": 0.005467653274536133,
563
+ "ResolveAccessConflict": 0.007354259490966797,
564
+ "ResolveComplicatePredicates": 0.0022590160369873047,
565
+ "RewriteReplicationMatmul": 0.0024857521057128906,
566
+ "RewriteWeights": 0.007905960083007813,
567
+ "SFKVectorizer": 0.45865941047668457,
568
+ "ShardingPropagationAnalysis": 0.015976905822753906,
569
+ "SimpleAllReduceTiling": 0.004487752914428711,
570
+ "Simplifier": 0.01264333724975586,
571
+ "SimplifyMacroPredicates": 0.010998964309692383,
572
+ "SimplifyNeuronTensor": 0.020704269409179688,
573
+ "SimplifySlice": 0.0029506683349609375,
574
+ "SimplifyTensor": 0.024234533309936523,
575
+ "SpillPSum": 0.03745222091674805,
576
+ "SplitAPUnionSets": 0.0402374267578125,
577
+ "SplitAccGrp": 0.0030994415283203125,
578
+ "StaticProfiler": 0.007781982421875,
579
+ "StaticTransposeLocalTensor": 0.015400409698486328,
580
+ "SundaISel": 0.15909790992736816,
581
+ "TCTransform": 0.0024313926696777344,
582
+ "TensorInitialization": 0.00689244270324707,
583
+ "TensorOpSimplifier": 0.009465932846069336,
584
+ "TensorOpTransform": 0.05043935775756836,
585
+ "TileCCOps": 0.01146245002746582,
586
+ "TilingProfiler": 0.030185699462890625,
587
+ "TransformConvOp": 0.003003835678100586,
588
+ "TritiumFusion": 0.07740235328674316,
589
+ "ValueNumbering": 0.006630659103393555,
590
+ "VectorizeDMA": 0.006995201110839844,
591
+ "VectorizeMatMult": 0.019536495208740234,
592
+ "WeightCoalescing": 0.007775783538818359,
593
+ "ZeroSizeTensorElimination": 0.0001773834228515625
594
+ },
595
+ "tensorizer": {
596
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 9885.0,
597
+ "StaticProfiler::AifUb": 33.7130126953125,
598
+ "StaticProfiler::ArithmeticIntensityTensorizer": 285.20709228515625,
599
+ "StaticProfiler::AverageDmaLength": 1479.2880859375,
600
+ "StaticProfiler::AverageFractalPeUtilization": 99.77941131591797,
601
+ "StaticProfiler::AveragePartitionUtilization": 99.22618865966797,
602
+ "StaticProfiler::AveragePeUtilization": 99.2345962524414,
603
+ "StaticProfiler::DDRTransferBytes": 55208456.0,
604
+ "StaticProfiler::InternalTransferBytes": 47980544.0,
605
+ "StaticProfiler::LoadExpanded": 15885.0,
606
+ "StaticProfiler::LocalizationEfficiency": 845.9852294921875,
607
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1554.64208984375,
608
+ "StaticProfiler::StoreExpanded": 10241.0,
609
+ "StaticProfiler::TotalDMAExpanded": 26126.0,
610
+ "StaticProfiler::TotalDynamicInstancesCount": 2424.0,
611
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 2417.0,
612
+ "StaticProfiler::TotalLNCComm": 0.0,
613
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
614
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
615
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
616
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
617
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
618
+ "TilingProfiler::GenericInstructionsAfterTiling": 80.0,
619
+ "TilingProfiler::MatMultInstructionsAfterTiling": 776.0,
620
+ "TilingProfiler::NumPfTransposes": 7.0,
621
+ "TilingProfiler::NumPfTransposesForIo": 0.0,
622
+ "TilingProfiler::NumPfTransposesForLocal": 3.0,
623
+ "TilingProfiler::NumPfTransposesForNonlocal": 4.0,
624
+ "TilingProfiler::PfTransposeInstructions": 448.0,
625
+ "TilingProfiler::PfTransposeInstructionsForIo": 0.0,
626
+ "TilingProfiler::PfTransposeInstructionsForLocal": 128.0,
627
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 320.0,
628
+ "TilingProfiler::ReduceInstructionsAfterTiling": 0.0,
629
+ "TilingProfiler::SimdInstructionsAfterTiling": 236.0,
630
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
631
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
632
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
633
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
634
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
635
+ "TransformConvOp::conv2d_column_packing": 0.0,
636
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
637
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
638
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
639
+ }
640
+ },
641
+ "sg0001": {
642
+ "compiletime": {
643
+ "AGOrderingAnalysisPass": 0.11948776245117188,
644
+ "AffinePredicateResolution": 0.0018799304962158203,
645
+ "AliasDependencyElimination": 0.00021576881408691406,
646
+ "AliasDependencyInduction": 0.007300615310668945,
647
+ "AliasDependencyReset": 0.025965213775634766,
648
+ "BFComputeCutting": 0.0029859542846679688,
649
+ "BirCodeGenLoop": 0.0455019474029541,
650
+ "CCOpFusion": 0.04734611511230469,
651
+ "CanonicalizeDAGForPGTiling": 0.022237777709960938,
652
+ "CanonicalizeIR": 0.002727985382080078,
653
+ "CoalesceCCOp": 0.02167034149169922,
654
+ "CommuteConcat": 0.003200054168701172,
655
+ "DMALocalityOpt": 0.00392460823059082,
656
+ "DMAProfiler": 0.009830236434936523,
657
+ "DMATilingProfiler": 0.025944948196411133,
658
+ "DataLocalityOpt": 0.3604612350463867,
659
+ "DataStreaming": 0.009065628051757813,
660
+ "DeConcat": 0.0069577693939208984,
661
+ "DeadCodeElimination": 0.011698722839355469,
662
+ "DeadStoreElimination": 0.06011176109313965,
663
+ "DelinearIndices": 0.020532608032226563,
664
+ "Delinearization": 0.00762939453125,
665
+ "DelinearizeSPMD": 0.03405618667602539,
666
+ "DoNothing": 8.106231689453125e-05,
667
+ "DramToDramTranspose": 0.01855611801147461,
668
+ "DumpGraphAndMetadata": 0.008964061737060547,
669
+ "EliminateDivs": 0.0031299591064453125,
670
+ "ExpandBatchNorm": 0.0030705928802490234,
671
+ "ExpandISAMacro": 0.006265163421630859,
672
+ "FactorizeBlkDims": 0.03638315200805664,
673
+ "FactorizeThreadAxesInFreeDims": 0.008359670639038086,
674
+ "FlattenMacroLoop": 0.012061595916748047,
675
+ "GenericAccessSimplifier": 0.0030562877655029297,
676
+ "InferInitValue": 0.08994674682617188,
677
+ "InferIntrinsicOnCC": 0.024573802947998047,
678
+ "InferNeuronTensor": 0.1031036376953125,
679
+ "InferNonlocalTensors": 0.05871725082397461,
680
+ "InferPSumTensor": 0.06618380546569824,
681
+ "InferShardAxis": 0.7525274753570557,
682
+ "InferSharedMemLoc": 0.0068051815032958984,
683
+ "InlineNativeKernels": 0.005843400955200195,
684
+ "InsertCoreBarrier": 0.008070230484008789,
685
+ "InsertIOTransposes": 0.04006528854370117,
686
+ "InsertImplicitShardAxisBeforeISel": 0.01073002815246582,
687
+ "InsertLocalTransposes": 0.014261007308959961,
688
+ "InsertOffloadedTransposes": 0.03949117660522461,
689
+ "LICM": 0.009208917617797852,
690
+ "LateLegalizeInst": 0.029766082763671875,
691
+ "LateLegalizePostSplit": 0.005662679672241211,
692
+ "LateLowerReshapeOp": 0.0074732303619384766,
693
+ "LateLowerTensorOp": 0.003675222396850586,
694
+ "LateNeuronInstComb": 0.010900020599365234,
695
+ "LayoutPreprocessing": 0.12459802627563477,
696
+ "LayoutPreprocessingAndAnalysis": 0.2370927333831787,
697
+ "LayoutRequirementAnalysis": 0.02673649787902832,
698
+ "LegalizeCCOpLayout": 0.001771688461303711,
699
+ "LegalizeOpLevelAlias": 0.001964569091796875,
700
+ "LegalizePartitionReduce": 0.0026857852935791016,
701
+ "LegalizeSundaAccess": 0.024449825286865234,
702
+ "LegalizeSundaMacro": 0.031160593032836914,
703
+ "LegalizeType": 0.01265263557434082,
704
+ "LocalLayoutOpt": 0.13158392906188965,
705
+ "LoopFusion": 0.008500337600708008,
706
+ "LoopSplitting": 0.007683753967285156,
707
+ "LowerBroadcast": 0.0029337406158447266,
708
+ "LowerCCOpBlockAxis": 0.019019126892089844,
709
+ "LowerComplexBroadcast": 0.0050733089447021484,
710
+ "LowerIntrinsics": 0.045258283615112305,
711
+ "LowerShardAxis": 0.010171175003051758,
712
+ "LowerTensorOp": 0.04014849662780762,
713
+ "LowerToSendRecv": 0.006317615509033203,
714
+ "LowerTranspose": 0.02257823944091797,
715
+ "MacroGeneration": 0.1289076805114746,
716
+ "MaskPropagation": 0.007184505462646484,
717
+ "MemcpyElimination": 0.13024330139160156,
718
+ "MutateDataType": 0.0023887157440185547,
719
+ "NeuronAliasDependencyInduction": 0.0008273124694824219,
720
+ "NeuronAliasDependencyReset": 0.023006439208984375,
721
+ "NeuronInstComb": 0.02357006072998047,
722
+ "NeuronLICM": 0.016632556915283203,
723
+ "NeuronLoopFusion": 0.05176591873168945,
724
+ "NeuronLoopInterchange": 0.003633737564086914,
725
+ "NeuronSimplifier": 0.055544376373291016,
726
+ "NeuronSimplifyPredicates": 0.0042285919189453125,
727
+ "NeuronValueNumbering": 0.007681369781494141,
728
+ "OptimizeAliasedCopyChain": 0.0018992424011230469,
729
+ "OptimizeNKIKernels": 0.42712831497192383,
730
+ "PAGLayoutOpt": 0.40447092056274414,
731
+ "PComputeCutting": 0.02052617073059082,
732
+ "PGLayoutTilingPipeline": 2.5240347385406494,
733
+ "PGTiling": 0.4373018741607666,
734
+ "PadElimination": 0.0004992485046386719,
735
+ "ParAxesAnnotation": 0.3364219665527344,
736
+ "PartialLoopFusion": 0.04578566551208496,
737
+ "PartialSimdFusion": 0.07974457740783691,
738
+ "PerfectLoopNest": 0.006705045700073242,
739
+ "RecognizeOpIdiom": 0.007408857345581055,
740
+ "Recompute": 0.0003921985626220703,
741
+ "RelaxPredicates": 0.004956483840942383,
742
+ "Rematerialization": 0.00407719612121582,
743
+ "RemoveShardedPartitionAxes": 0.03296494483947754,
744
+ "ReshapeWeights": 0.0016734600067138672,
745
+ "ResolveAccessConflict": 0.005868196487426758,
746
+ "ResolveComplicatePredicates": 0.0019488334655761719,
747
+ "RewriteReplicationMatmul": 0.002888917922973633,
748
+ "RewriteWeights": 0.0121307373046875,
749
+ "SFKVectorizer": 0.3227095603942871,
750
+ "ShardingPropagationAnalysis": 0.030770540237426758,
751
+ "SimpleAllReduceTiling": 0.005700588226318359,
752
+ "Simplifier": 0.006751298904418945,
753
+ "SimplifyMacroPredicates": 0.0224151611328125,
754
+ "SimplifyNeuronTensor": 0.026612043380737305,
755
+ "SimplifySlice": 0.0016014575958251953,
756
+ "SimplifyTensor": 0.014640331268310547,
757
+ "SpillPSum": 0.03543543815612793,
758
+ "SplitAPUnionSets": 0.04225468635559082,
759
+ "SplitAccGrp": 0.0025916099548339844,
760
+ "StaticProfiler": 0.004286527633666992,
761
+ "StaticTransposeLocalTensor": 0.01450037956237793,
762
+ "SundaISel": 0.09066033363342285,
763
+ "TCTransform": 0.001735687255859375,
764
+ "TensorInitialization": 0.005040168762207031,
765
+ "TensorOpSimplifier": 0.009763479232788086,
766
+ "TensorOpTransform": 0.037050485610961914,
767
+ "TileCCOps": 0.007235288619995117,
768
+ "TilingProfiler": 0.022336721420288086,
769
+ "TransformConvOp": 0.003210783004760742,
770
+ "TritiumFusion": 0.1834256649017334,
771
+ "ValueNumbering": 0.007995128631591797,
772
+ "VectorizeDMA": 0.009528160095214844,
773
+ "VectorizeMatMult": 0.04178977012634277,
774
+ "WeightCoalescing": 0.0037496089935302734,
775
+ "ZeroSizeTensorElimination": 0.00022602081298828125
776
+ },
777
+ "tensorizer": {
778
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 12395.0,
779
+ "StaticProfiler::AifUb": 272.9356689453125,
780
+ "StaticProfiler::ArithmeticIntensityTensorizer": 394.9350280761719,
781
+ "StaticProfiler::AverageDmaLength": 1993.7806396484375,
782
+ "StaticProfiler::AverageFractalPeUtilization": 100.0,
783
+ "StaticProfiler::AveragePartitionUtilization": 99.59767150878906,
784
+ "StaticProfiler::AveragePeUtilization": 100.0,
785
+ "StaticProfiler::DDRTransferBytes": 139593728.0,
786
+ "StaticProfiler::InternalTransferBytes": 38535168.0,
787
+ "StaticProfiler::LoadExpanded": 49793.0,
788
+ "StaticProfiler::LocalizationEfficiency": 144.69894409179688,
789
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 179.81776428222656,
790
+ "StaticProfiler::StoreExpanded": 11265.0,
791
+ "StaticProfiler::TotalDMAExpanded": 61058.0,
792
+ "StaticProfiler::TotalDynamicInstancesCount": 4975.0,
793
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 4975.0,
794
+ "StaticProfiler::TotalLNCComm": 0.0,
795
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
796
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
797
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
798
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
799
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
800
+ "TilingProfiler::GenericInstructionsAfterTiling": 64.0,
801
+ "TilingProfiler::MatMultInstructionsAfterTiling": 3072.0,
802
+ "TilingProfiler::NumPfTransposes": 8.0,
803
+ "TilingProfiler::NumPfTransposesForIo": 3.0,
804
+ "TilingProfiler::NumPfTransposesForLocal": 3.0,
805
+ "TilingProfiler::NumPfTransposesForNonlocal": 2.0,
806
+ "TilingProfiler::PfTransposeInstructions": 496.0,
807
+ "TilingProfiler::PfTransposeInstructionsForIo": 144.0,
808
+ "TilingProfiler::PfTransposeInstructionsForLocal": 96.0,
809
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 256.0,
810
+ "TilingProfiler::ReduceInstructionsAfterTiling": 0.0,
811
+ "TilingProfiler::SimdInstructionsAfterTiling": 275.0,
812
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
813
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
814
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
815
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
816
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
817
+ "TransformConvOp::conv2d_column_packing": 0.0,
818
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
819
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
820
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
821
+ }
822
+ },
823
+ "sg0002": {
824
+ "compiletime": {
825
+ "AGOrderingAnalysisPass": 0.057534217834472656,
826
+ "AffinePredicateResolution": 0.0009605884552001953,
827
+ "AliasDependencyElimination": 0.00025153160095214844,
828
+ "AliasDependencyInduction": 0.006276607513427734,
829
+ "AliasDependencyReset": 0.027743816375732422,
830
+ "BFComputeCutting": 0.0031321048736572266,
831
+ "BirCodeGenLoop": 0.5169932842254639,
832
+ "CCOpFusion": 0.05496716499328613,
833
+ "CanonicalizeDAGForPGTiling": 0.010706663131713867,
834
+ "CanonicalizeIR": 0.00154876708984375,
835
+ "CoalesceCCOp": 0.020469188690185547,
836
+ "CommuteConcat": 0.001708984375,
837
+ "DMALocalityOpt": 0.0024063587188720703,
838
+ "DMAProfiler": 0.021881103515625,
839
+ "DMATilingProfiler": 0.011522531509399414,
840
+ "DataLocalityOpt": 0.28015780448913574,
841
+ "DataStreaming": 0.018134355545043945,
842
+ "DeConcat": 0.002462148666381836,
843
+ "DeadCodeElimination": 0.0021996498107910156,
844
+ "DeadStoreElimination": 0.007483243942260742,
845
+ "DelinearIndices": 0.008810281753540039,
846
+ "Delinearization": 0.009731292724609375,
847
+ "DelinearizeSPMD": 0.04425859451293945,
848
+ "DoNothing": 6.67572021484375e-05,
849
+ "DramToDramTranspose": 0.012907743453979492,
850
+ "DumpGraphAndMetadata": 0.07597684860229492,
851
+ "EliminateDivs": 0.0021903514862060547,
852
+ "ExpandBatchNorm": 0.001527547836303711,
853
+ "ExpandISAMacro": 0.015442609786987305,
854
+ "FactorizeBlkDims": 0.020684003829956055,
855
+ "FactorizeThreadAxesInFreeDims": 0.003031015396118164,
856
+ "FlattenMacroLoop": 0.004990577697753906,
857
+ "GenericAccessSimplifier": 0.0007598400115966797,
858
+ "InferInitValue": 0.10130023956298828,
859
+ "InferIntrinsicOnCC": 0.007919549942016602,
860
+ "InferNeuronTensor": 0.05837249755859375,
861
+ "InferNonlocalTensors": 0.05706453323364258,
862
+ "InferPSumTensor": 0.04483771324157715,
863
+ "InferShardAxis": 0.4604020118713379,
864
+ "InferSharedMemLoc": 0.04048299789428711,
865
+ "InlineNativeKernels": 0.006569623947143555,
866
+ "InsertCoreBarrier": 0.010969161987304688,
867
+ "InsertIOTransposes": 0.0684211254119873,
868
+ "InsertImplicitShardAxisBeforeISel": 0.01549673080444336,
869
+ "InsertLocalTransposes": 0.022176742553710938,
870
+ "InsertOffloadedTransposes": 0.0181121826171875,
871
+ "LICM": 0.007555484771728516,
872
+ "LateLegalizeInst": 0.013030767440795898,
873
+ "LateLegalizePostSplit": 0.01993083953857422,
874
+ "LateLowerReshapeOp": 0.0016782283782958984,
875
+ "LateLowerTensorOp": 0.0021178722381591797,
876
+ "LateNeuronInstComb": 0.03255581855773926,
877
+ "LayoutPreprocessing": 0.10170960426330566,
878
+ "LayoutPreprocessingAndAnalysis": 0.23344039916992188,
879
+ "LayoutRequirementAnalysis": 0.032952308654785156,
880
+ "LegalizeCCOpLayout": 0.002583742141723633,
881
+ "LegalizeOpLevelAlias": 0.002170562744140625,
882
+ "LegalizePartitionReduce": 0.0025551319122314453,
883
+ "LegalizeSundaAccess": 0.08088016510009766,
884
+ "LegalizeSundaMacro": 0.04086017608642578,
885
+ "LegalizeType": 0.009904623031616211,
886
+ "LocalLayoutOpt": 0.023218154907226563,
887
+ "LoopFusion": 0.005990266799926758,
888
+ "LoopSplitting": 0.0007989406585693359,
889
+ "LowerBroadcast": 0.0051610469818115234,
890
+ "LowerCCOpBlockAxis": 0.007201671600341797,
891
+ "LowerComplexBroadcast": 0.00890207290649414,
892
+ "LowerIntrinsics": 0.09793353080749512,
893
+ "LowerShardAxis": 0.023633956909179688,
894
+ "LowerTensorOp": 0.03027796745300293,
895
+ "LowerToSendRecv": 0.027859210968017578,
896
+ "LowerTranspose": 0.0216217041015625,
897
+ "MacroGeneration": 0.12761783599853516,
898
+ "MaskPropagation": 0.01400303840637207,
899
+ "MemcpyElimination": 0.03596854209899902,
900
+ "MutateDataType": 0.0020971298217773438,
901
+ "NeuronAliasDependencyInduction": 0.0019202232360839844,
902
+ "NeuronAliasDependencyReset": 0.027405738830566406,
903
+ "NeuronInstComb": 0.024044275283813477,
904
+ "NeuronLICM": 0.027622222900390625,
905
+ "NeuronLoopFusion": 0.06255030632019043,
906
+ "NeuronLoopInterchange": 0.002681255340576172,
907
+ "NeuronSimplifier": 0.01907205581665039,
908
+ "NeuronSimplifyPredicates": 0.029021024703979492,
909
+ "NeuronValueNumbering": 0.011119604110717773,
910
+ "OptimizeAliasedCopyChain": 0.0005273818969726563,
911
+ "OptimizeNKIKernels": 4.391921043395996,
912
+ "PAGLayoutOpt": 0.16190624237060547,
913
+ "PComputeCutting": 0.016373872756958008,
914
+ "PGLayoutTilingPipeline": 2.0541465282440186,
915
+ "PGTiling": 0.3632845878601074,
916
+ "PadElimination": 0.0006501674652099609,
917
+ "ParAxesAnnotation": 0.08851456642150879,
918
+ "PartialLoopFusion": 0.05034661293029785,
919
+ "PartialSimdFusion": 0.014182329177856445,
920
+ "PerfectLoopNest": 0.0036270618438720703,
921
+ "RecognizeOpIdiom": 0.007064342498779297,
922
+ "Recompute": 0.00046062469482421875,
923
+ "RelaxPredicates": 0.02269601821899414,
924
+ "Rematerialization": 0.0019779205322265625,
925
+ "RemoveShardedPartitionAxes": 0.014830350875854492,
926
+ "ReshapeWeights": 0.0021474361419677734,
927
+ "ResolveAccessConflict": 0.007428646087646484,
928
+ "ResolveComplicatePredicates": 0.001834869384765625,
929
+ "RewriteReplicationMatmul": 0.006201982498168945,
930
+ "RewriteWeights": 0.004793643951416016,
931
+ "SFKVectorizer": 0.2884867191314697,
932
+ "ShardingPropagationAnalysis": 0.2801475524902344,
933
+ "SimpleAllReduceTiling": 0.008132696151733398,
934
+ "Simplifier": 0.003251314163208008,
935
+ "SimplifyMacroPredicates": 0.03280019760131836,
936
+ "SimplifyNeuronTensor": 0.04464459419250488,
937
+ "SimplifySlice": 0.0008628368377685547,
938
+ "SimplifyTensor": 0.014911413192749023,
939
+ "SpillPSum": 0.03145956993103027,
940
+ "SplitAPUnionSets": 0.09714126586914063,
941
+ "SplitAccGrp": 0.006166219711303711,
942
+ "StaticProfiler": 0.021403789520263672,
943
+ "StaticTransposeLocalTensor": 0.02319931983947754,
944
+ "SundaISel": 0.07143282890319824,
945
+ "TCTransform": 0.001344442367553711,
946
+ "TensorInitialization": 0.020877599716186523,
947
+ "TensorOpSimplifier": 0.0060787200927734375,
948
+ "TensorOpTransform": 0.03784608840942383,
949
+ "TileCCOps": 0.005100250244140625,
950
+ "TilingProfiler": 0.02941441535949707,
951
+ "TransformConvOp": 0.005896091461181641,
952
+ "TritiumFusion": 0.08978962898254395,
953
+ "ValueNumbering": 0.0032432079315185547,
954
+ "VectorizeDMA": 0.005987644195556641,
955
+ "VectorizeMatMult": 0.019278526306152344,
956
+ "WeightCoalescing": 0.004654884338378906,
957
+ "ZeroSizeTensorElimination": 0.00021028518676757813
958
+ },
959
+ "tensorizer": {
960
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 22664.0,
961
+ "StaticProfiler::AifUb": 229.36119079589844,
962
+ "StaticProfiler::ArithmeticIntensityTensorizer": 194.92408752441406,
963
+ "StaticProfiler::AverageDmaLength": 2258.685546875,
964
+ "StaticProfiler::AverageFractalPeUtilization": 98.80319213867188,
965
+ "StaticProfiler::AveragePartitionUtilization": 94.51075744628906,
966
+ "StaticProfiler::AveragePeUtilization": 96.83863067626953,
967
+ "StaticProfiler::DDRTransferBytes": 420482080.0,
968
+ "StaticProfiler::InternalTransferBytes": 338614048.0,
969
+ "StaticProfiler::LoadExpanded": 118366.0,
970
+ "StaticProfiler::LocalizationEfficiency": 84.98564147949219,
971
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 89.59233093261719,
972
+ "StaticProfiler::StoreExpanded": 4458.0,
973
+ "StaticProfiler::TotalDMAExpanded": 122824.0,
974
+ "StaticProfiler::TotalDynamicInstancesCount": 27423.0,
975
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 26972.0,
976
+ "StaticProfiler::TotalLNCComm": 0.0,
977
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
978
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
979
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
980
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
981
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
982
+ "TilingProfiler::GenericInstructionsAfterTiling": 4.0,
983
+ "TilingProfiler::MatMultInstructionsAfterTiling": 11808.0,
984
+ "TilingProfiler::NumPfTransposes": 5.0,
985
+ "TilingProfiler::NumPfTransposesForIo": 1.0,
986
+ "TilingProfiler::NumPfTransposesForLocal": 1.0,
987
+ "TilingProfiler::NumPfTransposesForNonlocal": 3.0,
988
+ "TilingProfiler::PfTransposeInstructions": 9889.0,
989
+ "TilingProfiler::PfTransposeInstructionsForIo": 9504.0,
990
+ "TilingProfiler::PfTransposeInstructionsForLocal": 1.0,
991
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 384.0,
992
+ "TilingProfiler::ReduceInstructionsAfterTiling": 6.0,
993
+ "TilingProfiler::SimdInstructionsAfterTiling": 165.0,
994
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
995
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
996
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
997
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
998
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
999
+ "TransformConvOp::conv2d_column_packing": 0.0,
1000
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
1001
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
1002
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
1003
+ }
1004
+ },
1005
+ "sg01": {
1006
+ "compiletime": {
1007
+ "CanonicalizeConv": 7.000000096013537e-06,
1008
+ "CanonicalizeForTensorizer": 1.1000000085914508e-05,
1009
+ "Canonicalizer": 0.00023700000019744039,
1010
+ "HoistCompute": 4.999999873689376e-06,
1011
+ "IdentifyCrossPassTensors": 1.2999999853491317e-05,
1012
+ "MemcastMotion": 7.999999979801942e-06,
1013
+ "PenguinizeFunctions": 1.2000000424450263e-05,
1014
+ "PruneFunctions": 1.700000029813964e-05,
1015
+ "RemoveOptimizationBarriers": 2.300000051036477e-05,
1016
+ "ScatterMotion": 1.700000029813964e-05,
1017
+ "TensorizerLegalizationPass": 1.5999999959603883e-05,
1018
+ "VerifySupportedOps": 9.999999747378752e-06,
1019
+ "algsimp": 4.70000013592653e-05,
1020
+ "batchnorm_expander": 1.2000000424450263e-05,
1021
+ "boundary-marker-removal": 4.999999873689376e-06,
1022
+ "call-inliner": 7.999999979801942e-06,
1023
+ "canonicalize-boundary-marker": 6.000000212225132e-06,
1024
+ "collective-stream-id-checker": 3.000000106112566e-06,
1025
+ "comparison-expander": 7.000000096013537e-06,
1026
+ "computation-deduplicator": 1.8999999156221747e-05,
1027
+ "config-lowering": 3.7000001611886546e-05,
1028
+ "constant_folding": 7.000000096013537e-06,
1029
+ "cse": 9.999999747378752e-06,
1030
+ "dce": 9.999999974752427e-07,
1031
+ "dynamic-slice-transpose": 3.999999989900971e-06,
1032
+ "eliminate-redundant-compare": 3.000000106112566e-06,
1033
+ "emit-offloaded-dropout": 1.2000000424450263e-05,
1034
+ "flatten-call-graph": 7.000000096013537e-06,
1035
+ "fuse-send-recv": 1.8999999156221747e-05,
1036
+ "hilo-conditional-to-select": 3.999999989900971e-06,
1037
+ "hilo::LegalizeAlias": 3.999999989900971e-06,
1038
+ "hilo::NeuronInstCombine": 5.0999999075429514e-05,
1039
+ "hilo::NeuronOpFusion": 1.700000029813964e-05,
1040
+ "hilo::ReplaceTokenTypeWithU8Pass": 1.2999999853491317e-05,
1041
+ "hilo::ScheduleFusion": 1.9999999949504854e-06,
1042
+ "hilo::SixtyFourHack": 1.2999999853491317e-05,
1043
+ "hilo::VerifyAliasing": 9.999999974752427e-07,
1044
+ "hlo-mac-count": 8.199999865610152e-05,
1045
+ "legalize-ccops-for-tensorizer": 9.999999974752427e-07,
1046
+ "legalize-compare": 3.999999989900971e-06,
1047
+ "lower-argminmax-custom-call": 3.999999989900971e-06,
1048
+ "map-inline": 1.1000000085914508e-05,
1049
+ "metadata-naming": 1.8000000636675395e-05,
1050
+ "mlir::detail::OpToOpPassAdaptor": 2.5999999706982635e-05,
1051
+ "mlir::hlo::MhloToPyPenguin": 0.0009560000034980476,
1052
+ "mlir::mhlo::LowerComplexExtraPass": 7.000000186963007e-05,
1053
+ "mlir::mhlo::LowerComplexPass": 0.00014000000373926014,
1054
+ "native-to-custom-softmax": 4.999999873689376e-06,
1055
+ "native-to-custom-softmax-dx": 1.8000000636675395e-05,
1056
+ "neuron-hlo-verifier": 0.0003600000054575503,
1057
+ "operand_upcaster": 1.4999999621068127e-05,
1058
+ "post-par-pipe-begin": 9.999999974752427e-07,
1059
+ "post-par-pipe-end": 0.0,
1060
+ "post-partition-simplification": 0.0004780000017490238,
1061
+ "replace-minimum-constant": 3.999999989900971e-06,
1062
+ "reshape-mover": 1.9999999949504854e-06,
1063
+ "simplify-concat": 3.9999998989515007e-05,
1064
+ "simplify-while-loops": 1.9999999949504854e-06,
1065
+ "transform-variadic-reduce": 9.000000318337698e-06,
1066
+ "tuple-simplifier": 3.999999989900971e-06,
1067
+ "unpack-nested-aws-ntwsr": 3.999999989900971e-06,
1068
+ "unroll-while-loop": 0.0
1069
+ },
1070
+ "hilo": {
1071
+ "ArithmeticIntensity": 374.9828186035156,
1072
+ "HloMacCount": 26843545600.0,
1073
+ "Traffic": 143172128.0
1074
+ }
1075
+ },
1076
+ "sg02": {
1077
+ "compiletime": {
1078
+ "CanonicalizeConv": 1.1000000085914508e-05,
1079
+ "CanonicalizeForTensorizer": 1.2000000424450263e-05,
1080
+ "Canonicalizer": 0.0002899999963119626,
1081
+ "HoistCompute": 3.999999989900971e-06,
1082
+ "IdentifyCrossPassTensors": 2.300000051036477e-05,
1083
+ "MemcastMotion": 1.1000000085914508e-05,
1084
+ "PenguinizeFunctions": 1.2000000424450263e-05,
1085
+ "PruneFunctions": 7.999999979801942e-06,
1086
+ "RemoveOptimizationBarriers": 1.4000000192027073e-05,
1087
+ "ScatterMotion": 3.000000106112566e-06,
1088
+ "TensorizerLegalizationPass": 7.000000096013537e-06,
1089
+ "VerifySupportedOps": 1.2000000424450263e-05,
1090
+ "algsimp": 5.6000000768108293e-05,
1091
+ "batchnorm_expander": 9.999999747378752e-06,
1092
+ "boundary-marker-removal": 3.999999989900971e-06,
1093
+ "call-inliner": 9.999999747378752e-06,
1094
+ "canonicalize-boundary-marker": 4.999999873689376e-06,
1095
+ "collective-stream-id-checker": 3.000000106112566e-06,
1096
+ "comparison-expander": 4.999999873689376e-06,
1097
+ "computation-deduplicator": 2.099999983329326e-05,
1098
+ "config-lowering": 4.400000034365803e-05,
1099
+ "constant_folding": 7.999999979801942e-06,
1100
+ "cse": 1.4000000192027073e-05,
1101
+ "dce": 9.999999974752427e-07,
1102
+ "dynamic-slice-transpose": 3.999999989900971e-06,
1103
+ "eliminate-redundant-compare": 3.000000106112566e-06,
1104
+ "emit-offloaded-dropout": 1.2999999853491317e-05,
1105
+ "flatten-call-graph": 1.1000000085914508e-05,
1106
+ "fuse-send-recv": 1.8999999156221747e-05,
1107
+ "hilo-conditional-to-select": 4.999999873689376e-06,
1108
+ "hilo::LegalizeAlias": 1.9999999949504854e-06,
1109
+ "hilo::NeuronInstCombine": 6.900000153109431e-05,
1110
+ "hilo::NeuronOpFusion": 6.000000212225132e-06,
1111
+ "hilo::ReplaceTokenTypeWithU8Pass": 1.5999999959603883e-05,
1112
+ "hilo::ScheduleFusion": 9.999999974752427e-07,
1113
+ "hilo::SixtyFourHack": 4.400000034365803e-05,
1114
+ "hilo::VerifyAliasing": 9.999999974752427e-07,
1115
+ "hlo-mac-count": 0.004767000209540129,
1116
+ "legalize-ccops-for-tensorizer": 9.999999974752427e-07,
1117
+ "legalize-compare": 3.000000106112566e-06,
1118
+ "lower-argminmax-custom-call": 3.000000106112566e-06,
1119
+ "map-inline": 1.1000000085914508e-05,
1120
+ "metadata-naming": 1.700000029813964e-05,
1121
+ "mlir::detail::OpToOpPassAdaptor": 2.9000000722589903e-05,
1122
+ "mlir::hlo::MhloToPyPenguin": 0.006047999951988459,
1123
+ "mlir::mhlo::LowerComplexExtraPass": 7.599999662488699e-05,
1124
+ "mlir::mhlo::LowerComplexPass": 9.699999645818025e-05,
1125
+ "native-to-custom-softmax": 6.000000212225132e-06,
1126
+ "native-to-custom-softmax-dx": 2.2000000171829015e-05,
1127
+ "neuron-hlo-verifier": 0.0003600000054575503,
1128
+ "operand_upcaster": 1.700000029813964e-05,
1129
+ "post-par-pipe-begin": 3.000000106112566e-06,
1130
+ "post-par-pipe-end": 0.0,
1131
+ "post-partition-simplification": 0.0005520000122487545,
1132
+ "replace-minimum-constant": 9.000000318337698e-06,
1133
+ "reshape-mover": 3.000000106112566e-06,
1134
+ "simplify-concat": 4.199999966658652e-05,
1135
+ "simplify-while-loops": 1.9999999949504854e-06,
1136
+ "transform-variadic-reduce": 4.70000013592653e-05,
1137
+ "tuple-simplifier": 4.999999873689376e-06,
1138
+ "unpack-nested-aws-ntwsr": 3.999999989900971e-06,
1139
+ "unroll-while-loop": 0.0
1140
+ },
1141
+ "hilo": {
1142
+ "ArithmeticIntensity": 107.69713592529297,
1143
+ "HloMacCount": 19483983872.0,
1144
+ "Traffic": 361829184.0
1145
+ }
1146
+ },
1147
+ "topk": {
1148
+ "compiletime": {
1149
+ "CoalesceCCOp": 0.0069692134857177734,
1150
+ "DMALocalityOpt": 0.006772279739379883,
1151
+ "DMAProfiler": 0.008215665817260742,
1152
+ "DataStreaming": 0.012622594833374023,
1153
+ "DoNothing": 0.004723310470581055,
1154
+ "ExpandISAMacro": 0.007757902145385742,
1155
+ "FactorizeBlkDims": 0.030848026275634766,
1156
+ "InferPSumTensor": 0.023444652557373047,
1157
+ "InferSharedMemLoc": 0.010675668716430664,
1158
+ "InsertCoreBarrier": 0.007489681243896484,
1159
+ "LateLegalizeInst": 0.01503753662109375,
1160
+ "LateNeuronInstComb": 0.017124652862548828,
1161
+ "LegalizeSundaAccess": 0.028142213821411133,
1162
+ "LegalizeType": 0.02222132682800293,
1163
+ "LowerBroadcast": 0.006150484085083008,
1164
+ "LowerIntrinsics": 0.00729680061340332,
1165
+ "LowerTranspose": 0.006754398345947266,
1166
+ "NeuronInstComb": 0.016539335250854492,
1167
+ "NeuronLICM": 0.024366140365600586,
1168
+ "NeuronSimplifyPredicates": 0.006876230239868164,
1169
+ "NeuronValueNumbering": 0.007918596267700195,
1170
+ "SFKVectorizer": 0.11957359313964844,
1171
+ "SimpleAllReduceTiling": 0.016579627990722656,
1172
+ "SimplifyNeuronTensor": 0.10249876976013184,
1173
+ "SpillPSum": 0.03416609764099121,
1174
+ "WeightCoalescing": 0.009296655654907227
1175
+ }
1176
+ }
1177
+ }
context_encoding_model/_tp0_bk3/graph.neff ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bdea9302d0f9d0785d148992ac29a3b377a867a1a9ce89c40e3ccad020e4ef73
3
+ size 1506304
context_encoding_model/_tp0_bk3/log-neuron-cc.txt ADDED
The diff for this file is too large to render. See raw diff
 
context_encoding_model/_tp0_bk3/metaneff.pb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a788ec9ea41bfa0696307ae7b82f6644a908b0b0a1feb7f30da3ca4349d0c13
3
+ size 2955932
context_encoding_model/_tp0_bk3/model.MODULE_be035899334776123ed5+d208bdce.hlo_module.pb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:616f0c948889cd427dac21bbe629a046747b018871cf2815b3477d1f3d54d269
3
+ size 3042718
context_encoding_model/_tp0_bk3/model.MODULE_be035899334776123ed5+d208bdce.neff ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bdea9302d0f9d0785d148992ac29a3b377a867a1a9ce89c40e3ccad020e4ef73
3
+ size 1506304
context_encoding_model/_tp0_bk3/neuron_config.json ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation_autoset": false,
3
+ "_name_or_path": "/home/ubuntu/models/Qwen3-1.7B",
4
+ "add_cross_attention": false,
5
+ "architectures": [
6
+ "Qwen3ForCausalLM"
7
+ ],
8
+ "attention_bias": false,
9
+ "attention_dropout": 0.0,
10
+ "attribute_map": {},
11
+ "bad_words_ids": null,
12
+ "begin_suppress_tokens": null,
13
+ "bos_token_id": 151643,
14
+ "chunk_size_feed_forward": 0,
15
+ "cross_attention_hidden_size": null,
16
+ "decoder_start_token_id": null,
17
+ "diversity_penalty": 0.0,
18
+ "do_sample": false,
19
+ "early_stopping": false,
20
+ "encoder_no_repeat_ngram_size": 0,
21
+ "eos_token_id": 151645,
22
+ "exponential_decay_length_penalty": null,
23
+ "finetuning_task": null,
24
+ "forced_bos_token_id": null,
25
+ "forced_eos_token_id": null,
26
+ "fused_spec_config": null,
27
+ "head_dim": 128,
28
+ "hidden_act": "silu",
29
+ "hidden_size": 2048,
30
+ "id2label": {
31
+ "0": "LABEL_0",
32
+ "1": "LABEL_1"
33
+ },
34
+ "initializer_range": 0.02,
35
+ "intermediate_size": 6144,
36
+ "is_decoder": false,
37
+ "is_encoder_decoder": false,
38
+ "label2id": {
39
+ "LABEL_0": 0,
40
+ "LABEL_1": 1
41
+ },
42
+ "length_penalty": 1.0,
43
+ "max_length": 20,
44
+ "max_position_embeddings": 40960,
45
+ "max_window_layers": 28,
46
+ "metadata": null,
47
+ "min_length": 0,
48
+ "model_type": "qwen3",
49
+ "neuron_config": {
50
+ "activation_quantization_type": null,
51
+ "allow_input_truncation": false,
52
+ "apply_seq_ids_mask": false,
53
+ "async_mode": false,
54
+ "attention_dp_degree": 1,
55
+ "attention_dtype": null,
56
+ "attn_block_cte_nki_kernel_enabled": false,
57
+ "attn_block_tkg_nki_kernel_cache_update": false,
58
+ "attn_block_tkg_nki_kernel_cascaded_attention": false,
59
+ "attn_block_tkg_nki_kernel_enabled": false,
60
+ "attn_cls": {
61
+ "__module__": "neuronx_distributed_inference.models.qwen3.modeling_qwen3",
62
+ "__name__": "NeuronQwen3Attention"
63
+ },
64
+ "attn_kernel_enabled": null,
65
+ "attn_tkg_builtin_kernel_enabled": false,
66
+ "attn_tkg_nki_kernel_enabled": false,
67
+ "batch_size": 1,
68
+ "bucket_n_active_tokens": true,
69
+ "buckets": [
70
+ 1024
71
+ ],
72
+ "cast_type": "config",
73
+ "cc_pipeline_tiling_factor": 2,
74
+ "chunked_prefill_config": null,
75
+ "context_encoding_buckets": [
76
+ 1024
77
+ ],
78
+ "cp_degree": 1,
79
+ "ctx_batch_size": 1,
80
+ "disable_kv_cache_tiling": false,
81
+ "draft_model_modules_to_not_convert": null,
82
+ "enable_bucketing": true,
83
+ "enable_cte_modular_flow": false,
84
+ "enable_eagle_draft_input_norm": false,
85
+ "enable_eagle_speculation": false,
86
+ "enable_fused_speculation": false,
87
+ "enable_long_context_mode": false,
88
+ "enable_output_completion_notifications": false,
89
+ "enable_spill_reload_dge": false,
90
+ "enable_token_tree": false,
91
+ "ep_degree": 1,
92
+ "expert_mlp_nki_kernel_enabled": null,
93
+ "flash_decoding_enabled": false,
94
+ "fused_qkv": false,
95
+ "fused_rmsnorm_skip_gamma": false,
96
+ "is_block_kv_layout": null,
97
+ "is_chunked_prefill": false,
98
+ "is_continuous_batching": true,
99
+ "is_eagle_draft": false,
100
+ "is_medusa": false,
101
+ "is_prefill_stage": true,
102
+ "is_prefix_caching": false,
103
+ "k_cache_transposed": false,
104
+ "kv_cache_batch_size": 8,
105
+ "kv_cache_padding_size": 0,
106
+ "kv_cache_quant": false,
107
+ "kv_cache_tiling": false,
108
+ "layer_boundary_markers": false,
109
+ "lm_head_pad": true,
110
+ "lm_head_pad_alignment_size": 1,
111
+ "local_ranks_size": 2,
112
+ "logical_nc_config": 2,
113
+ "lora_config": null,
114
+ "max_batch_size": 8,
115
+ "max_context_length": 4096,
116
+ "max_length": 4096,
117
+ "max_new_tokens": null,
118
+ "medusa_speculation_length": 0,
119
+ "medusa_tree": null,
120
+ "mlp_kernel_enabled": false,
121
+ "mlp_kernel_fuse_residual_add": false,
122
+ "modules_to_not_convert": null,
123
+ "moe_fused_nki_kernel_enabled": null,
124
+ "n_active_tokens": 4096,
125
+ "n_positions": 4096,
126
+ "num_medusa_heads": 0,
127
+ "on_cpu": false,
128
+ "on_device_sampling_config": {
129
+ "deterministic": false,
130
+ "do_sample": false,
131
+ "dynamic": true,
132
+ "global_topk": 256,
133
+ "on_device_sampling_config": true,
134
+ "temperature": 1.0,
135
+ "top_k": 1,
136
+ "top_k_kernel_enabled": false,
137
+ "top_p": 1.0
138
+ },
139
+ "output_logits": false,
140
+ "overrides_torch_dtype": true,
141
+ "pa_block_size": 4096,
142
+ "pa_num_blocks": 8,
143
+ "padding_side": "right",
144
+ "pp_degree": 1,
145
+ "prefix_buckets": null,
146
+ "qk_layernorm": false,
147
+ "qkv_kernel_enabled": false,
148
+ "qkv_kernel_fuse_residual_add": false,
149
+ "qkv_kernel_nbsd_layout": false,
150
+ "quantization_dtype": "int8",
151
+ "quantization_type": "per_tensor_symmetric",
152
+ "quantize_clamp_bound": Infinity,
153
+ "quantized": false,
154
+ "quantized_checkpoints_path": null,
155
+ "quantized_mlp_kernel_enabled": false,
156
+ "rmsnorm_quantize_kernel_enabled": false,
157
+ "router_topk_nki_kernel_enabled": null,
158
+ "rpl_reduce_dtype": null,
159
+ "save_sharded_checkpoint": true,
160
+ "scratchpad_page_size": null,
161
+ "seq_len": 4096,
162
+ "seq_len_threshold_for_cc_tiling": 16384,
163
+ "sequence_parallel_enabled": false,
164
+ "shared_mlp_nki_kernel_enabled": null,
165
+ "skip_sharding": false,
166
+ "skip_warmup": false,
167
+ "spec_batch_size": 8,
168
+ "speculation_length": 0,
169
+ "start_rank_id": 0,
170
+ "strided_context_parallel_kernel_enabled": false,
171
+ "target": null,
172
+ "tensor_capture_config": null,
173
+ "tile_cc": false,
174
+ "tkg_batch_size": 8,
175
+ "token_generation_buckets": null,
176
+ "token_tree_config": null,
177
+ "torch_dtype": "bfloat16",
178
+ "tp_degree": 2,
179
+ "vocab_parallel": false,
180
+ "weight_gather_seq_len_threshold": 32768,
181
+ "weights_to_skip_layout_optimization": [],
182
+ "world_size": 2
183
+ },
184
+ "no_repeat_ngram_size": 0,
185
+ "num_attention_heads": 16,
186
+ "num_beam_groups": 1,
187
+ "num_beams": 1,
188
+ "num_cores_per_group": 1,
189
+ "num_hidden_layers": 28,
190
+ "num_key_value_heads": 8,
191
+ "num_return_sequences": 1,
192
+ "output_attentions": false,
193
+ "output_hidden_states": false,
194
+ "output_scores": false,
195
+ "pad_token_id": 0,
196
+ "prefix": null,
197
+ "problem_type": null,
198
+ "pruned_heads": {},
199
+ "remove_invalid_values": false,
200
+ "repetition_penalty": 1.0,
201
+ "return_dict": true,
202
+ "return_dict_in_generate": false,
203
+ "rms_norm_eps": 1e-06,
204
+ "rope_scaling": null,
205
+ "rope_theta": 1000000,
206
+ "sep_token_id": null,
207
+ "sliding_window": null,
208
+ "suppress_tokens": null,
209
+ "task_specific_params": null,
210
+ "temperature": 1.0,
211
+ "tf_legacy_loss": false,
212
+ "tie_encoder_decoder": false,
213
+ "tie_word_embeddings": true,
214
+ "tokenizer_class": null,
215
+ "top_k": 50,
216
+ "top_p": 1.0,
217
+ "torchscript": false,
218
+ "transformers_version": "4.51.0",
219
+ "typical_p": 1.0,
220
+ "use_bfloat16": false,
221
+ "use_cache": true,
222
+ "use_sliding_window": false,
223
+ "vocab_size": 151936
224
+ }
context_encoding_model/_tp0_bk4/command.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ neuronx-cc compile --framework=XLA model.MODULE_95ef7ca73cc0a6161be2+96be3c33.hlo_module.pb --output model.MODULE_95ef7ca73cc0a6161be2+96be3c33.neff --target=trn2 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ' --lnc=2 -O1 '--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true' --logfile=log-neuron-cc.txt --verbose=35
context_encoding_model/_tp0_bk4/compile_flags.MODULE_95ef7ca73cc0a6161be2+96be3c33.json ADDED
@@ -0,0 +1 @@
 
 
1
+ ["--target=trn2", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "--lnc=2", "-O1", "--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true", "--logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk4/log-neuron-cc.txt"]
context_encoding_model/_tp0_bk4/global_metric_store.json ADDED
@@ -0,0 +1,1177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Average": {
3
+ "tensorizer": {
4
+ "StaticProfiler::AverageFractalPeUtilization": 98.93502807617188,
5
+ "StaticProfiler::AveragePartitionUtilization": 95.0970230102539,
6
+ "StaticProfiler::AveragePeUtilization": 97.18069458007813,
7
+ "StaticProfiler::LocalizationEfficiency": 73.73954010009766,
8
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 79.92718505859375,
9
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
10
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0
11
+ }
12
+ },
13
+ "Count": {
14
+ "tensorizer": {
15
+ "StaticProfiler::AverageFractalPeUtilization": 1.0,
16
+ "StaticProfiler::AveragePartitionUtilization": 1.0,
17
+ "StaticProfiler::AveragePeUtilization": 1.0,
18
+ "StaticProfiler::LocalizationEfficiency": 1.0,
19
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1.0,
20
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 1.0,
21
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 1.0
22
+ }
23
+ },
24
+ "Sum": {
25
+ "compiletime": {
26
+ "AGOrderingAnalysisPass": 0.04760026931762695,
27
+ "AffinePredicateResolution": 0.003319978713989258,
28
+ "AliasDependencyElimination": 0.0002167224884033203,
29
+ "AliasDependencyInduction": 0.008548259735107422,
30
+ "AliasDependencyReset": 0.03149843215942383,
31
+ "BFComputeCutting": 0.00810694694519043,
32
+ "BirCodeGenLoop": 0.2911098003387451,
33
+ "CCOpFusion": 0.08548593521118164,
34
+ "CanonicalizeConv": 2.7000001864507794e-05,
35
+ "CanonicalizeDAGForPGTiling": 0.007600545883178711,
36
+ "CanonicalizeForTensorizer": 5.699999746866524e-05,
37
+ "CanonicalizeIR": 0.0030400753021240234,
38
+ "Canonicalizer": 0.0011950000189244747,
39
+ "CoalesceCCOp": 0.020453453063964844,
40
+ "CommuteConcat": 0.007961034774780273,
41
+ "DMALocalityOpt": 0.016626834869384766,
42
+ "DMAProfiler": 0.018386363983154297,
43
+ "DMATilingProfiler": 0.009016752243041992,
44
+ "DataLocalityOpt": 0.17029356956481934,
45
+ "DataStreaming": 0.03981828689575195,
46
+ "DeConcat": 0.01120138168334961,
47
+ "DeadCodeElimination": 0.010882377624511719,
48
+ "DeadStoreElimination": 0.010195016860961914,
49
+ "DelinearIndices": 0.010077953338623047,
50
+ "Delinearization": 0.011870861053466797,
51
+ "DelinearizeSPMD": 0.035944223403930664,
52
+ "DoNothing": 0.0005605220794677734,
53
+ "DramToDramTranspose": 0.013046979904174805,
54
+ "DumpGraphAndMetadata": 0.03416705131530762,
55
+ "EliminateDivs": 0.004259586334228516,
56
+ "ExpandBatchNorm": 0.0017371177673339844,
57
+ "ExpandISAMacro": 0.014496326446533203,
58
+ "FactorizeBlkDims": 0.07086968421936035,
59
+ "FactorizeThreadAxesInFreeDims": 0.00911855697631836,
60
+ "FlattenMacroLoop": 0.0048520565032958984,
61
+ "GenericAccessSimplifier": 0.001367330551147461,
62
+ "HoistCompute": 6.000000212225132e-06,
63
+ "IdentifyCrossPassTensors": 6.199999916134402e-05,
64
+ "InferInitValue": 0.0836641788482666,
65
+ "InferIntrinsicOnCC": 0.008740901947021484,
66
+ "InferNeuronTensor": 0.05709338188171387,
67
+ "InferNonlocalTensors": 0.041548728942871094,
68
+ "InferPSumTensor": 0.23330998420715332,
69
+ "InferShardAxis": 0.5781030654907227,
70
+ "InferSharedMemLoc": 0.03158235549926758,
71
+ "InlineNativeKernels": 0.002477407455444336,
72
+ "InsertCoreBarrier": 0.015990734100341797,
73
+ "InsertIOTransposes": 0.039937734603881836,
74
+ "InsertImplicitShardAxisBeforeISel": 0.013466596603393555,
75
+ "InsertLocalTransposes": 0.018125534057617188,
76
+ "InsertOffloadedTransposes": 0.014874696731567383,
77
+ "LICM": 0.0058231353759765625,
78
+ "LateLegalizeInst": 0.037004947662353516,
79
+ "LateLegalizePostSplit": 0.02429652214050293,
80
+ "LateLowerReshapeOp": 0.0018832683563232422,
81
+ "LateLowerTensorOp": 0.0021920204162597656,
82
+ "LateNeuronInstComb": 0.06391644477844238,
83
+ "LayoutPreprocessing": 0.06973385810852051,
84
+ "LayoutPreprocessingAndAnalysis": 0.11140203475952148,
85
+ "LayoutRequirementAnalysis": 0.013022661209106445,
86
+ "LegalizeCCOpLayout": 0.0020427703857421875,
87
+ "LegalizeOpLevelAlias": 0.0016918182373046875,
88
+ "LegalizePartitionReduce": 0.0030241012573242188,
89
+ "LegalizeSundaAccess": 0.08372640609741211,
90
+ "LegalizeSundaMacro": 0.02708148956298828,
91
+ "LegalizeType": 0.04078388214111328,
92
+ "LocalLayoutOpt": 0.022045135498046875,
93
+ "LoopFusion": 0.029404163360595703,
94
+ "LoopSplitting": 0.0007355213165283203,
95
+ "LowerBroadcast": 0.02869558334350586,
96
+ "LowerCCOpBlockAxis": 0.007714748382568359,
97
+ "LowerComplexBroadcast": 0.005654096603393555,
98
+ "LowerIntrinsics": 0.051032304763793945,
99
+ "LowerShardAxis": 0.03305673599243164,
100
+ "LowerTensorOp": 0.028458356857299805,
101
+ "LowerToSendRecv": 0.03391242027282715,
102
+ "LowerTranspose": 0.051642656326293945,
103
+ "MacroGeneration": 0.06428074836730957,
104
+ "MaskPropagation": 0.0036263465881347656,
105
+ "MemcastMotion": 1.700000029813964e-05,
106
+ "MemcpyElimination": 0.05451250076293945,
107
+ "MutateDataType": 0.001516103744506836,
108
+ "NeuronAliasDependencyInduction": 0.0005834102630615234,
109
+ "NeuronAliasDependencyReset": 0.022034168243408203,
110
+ "NeuronInstComb": 0.06097984313964844,
111
+ "NeuronLICM": 0.05481839179992676,
112
+ "NeuronLoopFusion": 0.07339620590209961,
113
+ "NeuronLoopInterchange": 0.0027348995208740234,
114
+ "NeuronSimplifier": 0.021918296813964844,
115
+ "NeuronSimplifyPredicates": 0.024098873138427734,
116
+ "NeuronValueNumbering": 0.022985458374023438,
117
+ "OptimizeAliasedCopyChain": 0.0008976459503173828,
118
+ "OptimizeNKIKernels": 4.611967086791992,
119
+ "PAGLayoutOpt": 0.2917053699493408,
120
+ "PComputeCutting": 0.008776664733886719,
121
+ "PGLayoutTilingPipeline": 1.8517823219299316,
122
+ "PGTiling": 0.26313185691833496,
123
+ "PadElimination": 0.0006458759307861328,
124
+ "ParAxesAnnotation": 0.188338041305542,
125
+ "PartialLoopFusion": 0.05682229995727539,
126
+ "PartialSimdFusion": 0.0237729549407959,
127
+ "PenguinizeFunctions": 5.5999997130129486e-05,
128
+ "PerfectLoopNest": 0.00557398796081543,
129
+ "PruneFunctions": 3.9999998989515007e-05,
130
+ "RecognizeOpIdiom": 0.008669376373291016,
131
+ "Recompute": 0.0005908012390136719,
132
+ "RelaxPredicates": 0.006473541259765625,
133
+ "Rematerialization": 0.011237144470214844,
134
+ "RemoveOptimizationBarriers": 7.400000322377309e-05,
135
+ "RemoveShardedPartitionAxes": 0.014671802520751953,
136
+ "ReshapeWeights": 0.0018546581268310547,
137
+ "ResolveAccessConflict": 0.008959770202636719,
138
+ "ResolveComplicatePredicates": 0.0009264945983886719,
139
+ "RewriteReplicationMatmul": 0.0037200450897216797,
140
+ "RewriteWeights": 0.008005380630493164,
141
+ "SFKVectorizer": 0.2967853546142578,
142
+ "ScatterMotion": 1.900000097521115e-05,
143
+ "ShardingPropagationAnalysis": 0.10689902305603027,
144
+ "SimpleAllReduceTiling": 0.010908842086791992,
145
+ "Simplifier": 0.00808858871459961,
146
+ "SimplifyMacroPredicates": 0.031823158264160156,
147
+ "SimplifyNeuronTensor": 0.12780547142028809,
148
+ "SimplifySlice": 0.001531362533569336,
149
+ "SimplifyTensor": 0.018309593200683594,
150
+ "SpillPSum": 0.09417366981506348,
151
+ "SplitAPUnionSets": 0.09693408012390137,
152
+ "SplitAccGrp": 0.0025701522827148438,
153
+ "StaticProfiler": 0.04053521156311035,
154
+ "StaticTransposeLocalTensor": 0.012635231018066406,
155
+ "SundaISel": 0.10333561897277832,
156
+ "TCTransform": 0.006776332855224609,
157
+ "TensorInitialization": 0.011014938354492188,
158
+ "TensorOpSimplifier": 0.005452632904052734,
159
+ "TensorOpTransform": 0.033481597900390625,
160
+ "TensorizerLegalizationPass": 6.399999983841553e-05,
161
+ "TileCCOps": 0.011636972427368164,
162
+ "TilingProfiler": 0.024947643280029297,
163
+ "TransformConvOp": 0.013001441955566406,
164
+ "TritiumFusion": 0.1458723545074463,
165
+ "ValueNumbering": 0.003311634063720703,
166
+ "VectorizeDMA": 0.005986928939819336,
167
+ "VectorizeMatMult": 0.028806686401367188,
168
+ "VerifySupportedOps": 5.100000271340832e-05,
169
+ "WeightCoalescing": 0.01451730728149414,
170
+ "ZeroSizeTensorElimination": 0.00017833709716796875,
171
+ "algsimp": 0.0020910000894218683,
172
+ "batchnorm_expander": 5.0000002374872565e-05,
173
+ "boundary-marker-removal": 1.900000097521115e-05,
174
+ "call-inliner": 0.00046300000394694507,
175
+ "canonicalize-boundary-marker": 2.300000051036477e-05,
176
+ "collective-stream-id-checker": 8.800000068731606e-05,
177
+ "comparison-expander": 0.0005719999899156392,
178
+ "computation-deduplicator": 8.399999933317304e-05,
179
+ "config-lowering": 0.00016599999798927456,
180
+ "constant-statistics": 0.0004529999860096723,
181
+ "constant_folding": 0.00018699999782256782,
182
+ "cse": 6.299999949987978e-05,
183
+ "dce": 4.400000034365803e-05,
184
+ "dot_decomposer": 0.001028000027872622,
185
+ "dynamic-slice-transpose": 2.0000001313746907e-05,
186
+ "eliminate-redundant-compare": 0.00014699999883305281,
187
+ "emit-offloaded-dropout": 5.499999679159373e-05,
188
+ "flatten-call-graph": 0.0006470000371336937,
189
+ "fuse-send-recv": 9.600000339560211e-05,
190
+ "hilo-conditional-to-select": 2.9000000722589903e-05,
191
+ "hilo::LegalizeAlias": 1.500000053056283e-05,
192
+ "hilo::NeuronInstCombine": 0.00012700000661425292,
193
+ "hilo::NeuronOpFusion": 4.099999932805076e-05,
194
+ "hilo::ReplaceTokenTypeWithU8Pass": 7.400000322377309e-05,
195
+ "hilo::ScheduleFusion": 3.000000106112566e-06,
196
+ "hilo::SixtyFourHack": 9.599999611964449e-05,
197
+ "hilo::VerifyAliasing": 6.000000212225132e-06,
198
+ "hlo-mac-count": 0.015143999829888344,
199
+ "instruction-histogram": 0.0010160000529140234,
200
+ "io-con-pipe-begin": 4.999999873689376e-06,
201
+ "io-con-pipe-end": 0.0,
202
+ "io-layout-normalization": 0.0007440000190399587,
203
+ "io-statistics": 3.9999998989515007e-05,
204
+ "legalize-ccops-for-tensorizer": 6.000000212225132e-06,
205
+ "legalize-compare": 1.3999999282532372e-05,
206
+ "lower-argminmax-custom-call": 1.5999999959603883e-05,
207
+ "map-inline": 0.0008340000058524311,
208
+ "metadata-naming": 7.79999973019585e-05,
209
+ "mlir::detail::OpToOpPassAdaptor": 5.8000001445179805e-05,
210
+ "mlir::hlo::MhloToPyPenguin": 0.013376999646425247,
211
+ "mlir::mhlo::LowerComplexExtraPass": 0.00022300001000985503,
212
+ "mlir::mhlo::LowerComplexPass": 0.0004149999876972288,
213
+ "native-to-custom-softmax": 0.0003029999788850546,
214
+ "native-to-custom-softmax-dx": 0.0021089999936521053,
215
+ "neuron-hlo-verifier": 0.011952999979257584,
216
+ "operand_upcaster": 5.5999997130129486e-05,
217
+ "opt-barrier-removal": 0.00026000000070780516,
218
+ "post-par-pipe-begin": 0.0003480000013951212,
219
+ "post-par-pipe-end": 0.0,
220
+ "post-partition-simplification": 0.002303000073879957,
221
+ "pre-par-pipe-begin": 9.999999974752427e-07,
222
+ "pre-par-pipe-end": 0.0,
223
+ "pre-partition-simplification": 0.07090699672698975,
224
+ "replace-minimum-constant": 0.0003819999983534217,
225
+ "reshape-mover": 6.299999949987978e-05,
226
+ "simplify-concat": 0.00014800000644754618,
227
+ "simplify-while-loops": 9.100000170292333e-05,
228
+ "transform-variadic-reduce": 9.299999510403723e-05,
229
+ "tuple-simplifier": 0.0001649999903747812,
230
+ "unpack-nested-aws-ntwsr": 0.00024099998699966818,
231
+ "unroll-while-loop": 3.5000000934815034e-05,
232
+ "zero_sized_hlo_elimination": 0.00072900002123788
233
+ },
234
+ "hilo": {
235
+ "ConstantSize": 3678847.0,
236
+ "HloInputCount": 371.0,
237
+ "HloMacCount": 111825780736.0,
238
+ "HloOutputCount": 57.0,
239
+ "IfmapSize": 3910928384.0,
240
+ "OfmapSize": 1879048192.0,
241
+ "OutputsReadFromCount": 0.0,
242
+ "PassthroughTensorsCount": 0.0,
243
+ "RedundantOutputCount": 0.0,
244
+ "Traffic": 973052032.0
245
+ },
246
+ "tensorizer": {
247
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 25519.0,
248
+ "StaticProfiler::AifUb": 337.1839904785156,
249
+ "StaticProfiler::ArithmeticIntensityTensorizer": 248.63792419433594,
250
+ "StaticProfiler::AverageDmaLength": 2413.602294921875,
251
+ "StaticProfiler::DDRTransferBytes": 495991840.0,
252
+ "StaticProfiler::InternalTransferBytes": 361682720.0,
253
+ "StaticProfiler::LoadExpanded": 133728.0,
254
+ "StaticProfiler::StoreExpanded": 7530.0,
255
+ "StaticProfiler::TotalDMAExpanded": 141258.0,
256
+ "StaticProfiler::TotalDynamicInstancesCount": 30781.0,
257
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 30330.0,
258
+ "StaticProfiler::TotalLNCComm": 0.0,
259
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
260
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
261
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
262
+ "TilingProfiler::GenericInstructionsAfterTiling": 4.0,
263
+ "TilingProfiler::MatMultInstructionsAfterTiling": 14112.0,
264
+ "TilingProfiler::NumPfTransposes": 5.0,
265
+ "TilingProfiler::NumPfTransposesForIo": 1.0,
266
+ "TilingProfiler::NumPfTransposesForLocal": 1.0,
267
+ "TilingProfiler::NumPfTransposesForNonlocal": 3.0,
268
+ "TilingProfiler::PfTransposeInstructions": 10273.0,
269
+ "TilingProfiler::PfTransposeInstructionsForIo": 9504.0,
270
+ "TilingProfiler::PfTransposeInstructionsForLocal": 1.0,
271
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 768.0,
272
+ "TilingProfiler::ReduceInstructionsAfterTiling": 10.0,
273
+ "TilingProfiler::SimdInstructionsAfterTiling": 311.0,
274
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
275
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
276
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
277
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
278
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
279
+ "TransformConvOp::conv2d_column_packing": 0.0,
280
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
281
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
282
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
283
+ }
284
+ },
285
+ "all": {
286
+ "compiletime": {
287
+ "algsimp": 0.0017770000267773867,
288
+ "call-inliner": 0.00041700000292621553,
289
+ "collective-stream-id-checker": 5.500000042957254e-05,
290
+ "comparison-expander": 0.0005280000041238964,
291
+ "constant-statistics": 0.0004529999860096723,
292
+ "constant_folding": 0.0001429999974789098,
293
+ "dce": 3.899999865097925e-05,
294
+ "dot_decomposer": 0.001028000027872622,
295
+ "eliminate-redundant-compare": 0.0001320000010309741,
296
+ "flatten-call-graph": 0.0006070000235922635,
297
+ "hlo-mac-count": 0.007338999770581722,
298
+ "instruction-histogram": 0.0010160000529140234,
299
+ "io-con-pipe-begin": 4.999999873689376e-06,
300
+ "io-con-pipe-end": 0.0,
301
+ "io-layout-normalization": 0.0007440000190399587,
302
+ "io-statistics": 3.9999998989515007e-05,
303
+ "map-inline": 0.0007900000200606883,
304
+ "native-to-custom-softmax": 0.00028199999360367656,
305
+ "native-to-custom-softmax-dx": 0.00042699999175965786,
306
+ "neuron-hlo-verifier": 0.010262000374495983,
307
+ "opt-barrier-removal": 0.00026000000070780516,
308
+ "pre-par-pipe-begin": 9.999999974752427e-07,
309
+ "pre-par-pipe-end": 0.0,
310
+ "pre-partition-simplification": 0.07090699672698975,
311
+ "replace-minimum-constant": 0.0003480000013951212,
312
+ "reshape-mover": 4.8999998398358e-05,
313
+ "simplify-while-loops": 8.099999831756577e-05,
314
+ "tuple-simplifier": 0.0001429999974789098,
315
+ "unpack-nested-aws-ntwsr": 0.00022499999613501132,
316
+ "unroll-while-loop": 1.2000000424450263e-05,
317
+ "zero_sized_hlo_elimination": 0.00072900002123788
318
+ }
319
+ },
320
+ "attention_isa_kernel": {
321
+ "compiletime": {
322
+ "CoalesceCCOp": 0.00029277801513671875,
323
+ "DMALocalityOpt": 0.00019669532775878906,
324
+ "DMAProfiler": 0.0002949237823486328,
325
+ "DataStreaming": 0.0002338886260986328,
326
+ "DoNothing": 0.0014209747314453125,
327
+ "ExpandISAMacro": 0.00028014183044433594,
328
+ "FactorizeBlkDims": 0.0051081180572509766,
329
+ "InferPSumTensor": 0.0036172866821289063,
330
+ "InferSharedMemLoc": 0.0005719661712646484,
331
+ "InsertCoreBarrier": 0.0023279190063476563,
332
+ "LateLegalizeInst": 0.0016858577728271484,
333
+ "LateNeuronInstComb": 0.00044226646423339844,
334
+ "LegalizeSundaAccess": 0.0002193450927734375,
335
+ "LegalizeType": 0.002800464630126953,
336
+ "LowerBroadcast": 0.0002620220184326172,
337
+ "LowerIntrinsics": 0.0003139972686767578,
338
+ "LowerTranspose": 0.0002512931823730469,
339
+ "NeuronInstComb": 0.0005278587341308594,
340
+ "NeuronLICM": 0.0002562999725341797,
341
+ "NeuronSimplifyPredicates": 0.0002334117889404297,
342
+ "NeuronValueNumbering": 0.0002815723419189453,
343
+ "SFKVectorizer": 0.005394458770751953,
344
+ "SimpleAllReduceTiling": 0.0003223419189453125,
345
+ "SimplifyNeuronTensor": 0.0007545948028564453,
346
+ "SpillPSum": 0.0006477832794189453,
347
+ "WeightCoalescing": 0.00023102760314941406
348
+ }
349
+ },
350
+ "cumsum": {
351
+ "compiletime": {
352
+ "CoalesceCCOp": 0.00034165382385253906,
353
+ "DMALocalityOpt": 0.0003287792205810547,
354
+ "DMAProfiler": 0.001161336898803711,
355
+ "DataStreaming": 0.0004813671112060547,
356
+ "DoNothing": 0.00018596649169921875,
357
+ "ExpandISAMacro": 0.0008256435394287109,
358
+ "FactorizeBlkDims": 0.0007493495941162109,
359
+ "InferPSumTensor": 0.0011432170867919922,
360
+ "InferSharedMemLoc": 0.00045013427734375,
361
+ "InsertCoreBarrier": 0.00044918060302734375,
362
+ "LateLegalizeInst": 0.0019235610961914063,
363
+ "LateNeuronInstComb": 0.0011394023895263672,
364
+ "LegalizeSundaAccess": 0.002297639846801758,
365
+ "LegalizeType": 0.00036334991455078125,
366
+ "LowerBroadcast": 0.0003592967987060547,
367
+ "LowerIntrinsics": 0.000362396240234375,
368
+ "LowerTranspose": 0.0003514289855957031,
369
+ "NeuronInstComb": 0.0034132003784179688,
370
+ "NeuronLICM": 0.0006377696990966797,
371
+ "NeuronSimplifyPredicates": 0.0035140514373779297,
372
+ "NeuronValueNumbering": 0.001703023910522461,
373
+ "SFKVectorizer": 0.009377241134643555,
374
+ "SimpleAllReduceTiling": 0.0003190040588378906,
375
+ "SimplifyNeuronTensor": 0.0036399364471435547,
376
+ "SpillPSum": 0.0008790493011474609,
377
+ "WeightCoalescing": 0.0003619194030761719
378
+ }
379
+ },
380
+ "sg00": {
381
+ "compiletime": {
382
+ "CanonicalizeConv": 1.9999999949504854e-06,
383
+ "CanonicalizeForTensorizer": 1.8999999156221747e-05,
384
+ "Canonicalizer": 0.0004579999949783087,
385
+ "HoistCompute": 3.000000106112566e-06,
386
+ "IdentifyCrossPassTensors": 2.5999999706982635e-05,
387
+ "MemcastMotion": 9.999999747378752e-06,
388
+ "PenguinizeFunctions": 1.9999999494757503e-05,
389
+ "PruneFunctions": 1.2000000424450263e-05,
390
+ "RemoveOptimizationBarriers": 2.499999936844688e-05,
391
+ "ScatterMotion": 9.999999747378752e-06,
392
+ "TensorizerLegalizationPass": 3.199999991920777e-05,
393
+ "VerifySupportedOps": 1.700000029813964e-05,
394
+ "algsimp": 8.900000102585182e-05,
395
+ "batchnorm_expander": 1.700000029813964e-05,
396
+ "boundary-marker-removal": 6.000000212225132e-06,
397
+ "call-inliner": 1.2999999853491317e-05,
398
+ "canonicalize-boundary-marker": 7.999999979801942e-06,
399
+ "collective-stream-id-checker": 2.499999936844688e-05,
400
+ "comparison-expander": 6.000000212225132e-06,
401
+ "computation-deduplicator": 2.499999936844688e-05,
402
+ "config-lowering": 5.0999999075429514e-05,
403
+ "constant_folding": 1.2999999853491317e-05,
404
+ "cse": 1.9999999494757503e-05,
405
+ "dce": 1.9999999949504854e-06,
406
+ "dynamic-slice-transpose": 7.000000096013537e-06,
407
+ "eliminate-redundant-compare": 4.999999873689376e-06,
408
+ "emit-offloaded-dropout": 1.8999999156221747e-05,
409
+ "flatten-call-graph": 1.2000000424450263e-05,
410
+ "fuse-send-recv": 2.8000000384054147e-05,
411
+ "hilo-conditional-to-select": 7.999999979801942e-06,
412
+ "hilo::LegalizeAlias": 6.000000212225132e-06,
413
+ "hilo::NeuronInstCombine": 5.900000178371556e-05,
414
+ "hilo::NeuronOpFusion": 1.1000000085914508e-05,
415
+ "hilo::ReplaceTokenTypeWithU8Pass": 3.5000000934815034e-05,
416
+ "hilo::ScheduleFusion": 9.999999974752427e-07,
417
+ "hilo::SixtyFourHack": 1.8000000636675395e-05,
418
+ "hilo::VerifyAliasing": 3.000000106112566e-06,
419
+ "hlo-mac-count": 0.00014400000509340316,
420
+ "legalize-ccops-for-tensorizer": 3.000000106112566e-06,
421
+ "legalize-compare": 4.999999873689376e-06,
422
+ "lower-argminmax-custom-call": 4.999999873689376e-06,
423
+ "map-inline": 1.4000000192027073e-05,
424
+ "metadata-naming": 2.4000000848900527e-05,
425
+ "mlir::detail::OpToOpPassAdaptor": 1.9999999494757503e-05,
426
+ "mlir::hlo::MhloToPyPenguin": 0.0029299999587237835,
427
+ "mlir::mhlo::LowerComplexExtraPass": 7.200000254670158e-05,
428
+ "mlir::mhlo::LowerComplexPass": 0.00014200000441633165,
429
+ "native-to-custom-softmax": 7.999999979801942e-06,
430
+ "native-to-custom-softmax-dx": 0.0016329999780282378,
431
+ "neuron-hlo-verifier": 0.000598000013269484,
432
+ "operand_upcaster": 1.9999999494757503e-05,
433
+ "post-par-pipe-begin": 0.00034500000765547156,
434
+ "post-par-pipe-end": 0.0,
435
+ "post-partition-simplification": 0.0007699999841861427,
436
+ "replace-minimum-constant": 9.999999747378752e-06,
437
+ "reshape-mover": 4.999999873689376e-06,
438
+ "simplify-concat": 4.70000013592653e-05,
439
+ "simplify-while-loops": 3.000000106112566e-06,
440
+ "transform-variadic-reduce": 1.1000000085914508e-05,
441
+ "tuple-simplifier": 7.000000096013537e-06,
442
+ "unpack-nested-aws-ntwsr": 4.999999873689376e-06,
443
+ "unroll-while-loop": 9.999999974752427e-07
444
+ },
445
+ "hilo": {
446
+ "ArithmeticIntensity": 79.95455932617188,
447
+ "ConstantSize": 3678847.0,
448
+ "HloInputCount": 371.0,
449
+ "HloMacCount": 17179869184.0,
450
+ "HloOutputCount": 57.0,
451
+ "IfmapSize": 3910928384.0,
452
+ "OfmapSize": 1879048192.0,
453
+ "OutputsReadFromCount": 0.0,
454
+ "PassthroughTensorsCount": 0.0,
455
+ "RedundantOutputCount": 0.0,
456
+ "Traffic": 429740832.0
457
+ }
458
+ },
459
+ "sg0000": {
460
+ "compiletime": {
461
+ "AGOrderingAnalysisPass": 0.06203174591064453,
462
+ "AffinePredicateResolution": 0.001997709274291992,
463
+ "AliasDependencyElimination": 0.00024080276489257813,
464
+ "AliasDependencyInduction": 0.0331728458404541,
465
+ "AliasDependencyReset": 0.10205578804016113,
466
+ "BFComputeCutting": 0.007540702819824219,
467
+ "BirCodeGenLoop": 0.15983891487121582,
468
+ "CCOpFusion": 0.06544995307922363,
469
+ "CanonicalizeDAGForPGTiling": 0.004024982452392578,
470
+ "CanonicalizeIR": 0.001623392105102539,
471
+ "CoalesceCCOp": 0.011837482452392578,
472
+ "CommuteConcat": 0.009541988372802734,
473
+ "DMALocalityOpt": 0.0019822120666503906,
474
+ "DMAProfiler": 0.007272958755493164,
475
+ "DMATilingProfiler": 0.007293224334716797,
476
+ "DataLocalityOpt": 0.2593100070953369,
477
+ "DataStreaming": 0.0239105224609375,
478
+ "DeConcat": 0.005833864212036133,
479
+ "DeadCodeElimination": 0.00394749641418457,
480
+ "DeadStoreElimination": 0.07077240943908691,
481
+ "DelinearIndices": 0.02637171745300293,
482
+ "Delinearization": 0.01995396614074707,
483
+ "DelinearizeSPMD": 0.03704118728637695,
484
+ "DoNothing": 9.799003601074219e-05,
485
+ "DramToDramTranspose": 0.03482198715209961,
486
+ "DumpGraphAndMetadata": 0.01542520523071289,
487
+ "EliminateDivs": 0.005273103713989258,
488
+ "ExpandBatchNorm": 0.0026073455810546875,
489
+ "ExpandISAMacro": 0.008665800094604492,
490
+ "FactorizeBlkDims": 0.061437368392944336,
491
+ "FactorizeThreadAxesInFreeDims": 0.002484560012817383,
492
+ "FlattenMacroLoop": 0.008157968521118164,
493
+ "GenericAccessSimplifier": 0.0014643669128417969,
494
+ "InferInitValue": 0.08534860610961914,
495
+ "InferIntrinsicOnCC": 0.01716780662536621,
496
+ "InferNeuronTensor": 0.09510421752929688,
497
+ "InferNonlocalTensors": 0.16463732719421387,
498
+ "InferPSumTensor": 0.09516620635986328,
499
+ "InferShardAxis": 0.5436458587646484,
500
+ "InferSharedMemLoc": 0.013478994369506836,
501
+ "InlineNativeKernels": 0.0027844905853271484,
502
+ "InsertCoreBarrier": 0.008362293243408203,
503
+ "InsertIOTransposes": 0.07836699485778809,
504
+ "InsertImplicitShardAxisBeforeISel": 0.008057355880737305,
505
+ "InsertLocalTransposes": 0.01099085807800293,
506
+ "InsertOffloadedTransposes": 0.03647184371948242,
507
+ "LICM": 0.005979299545288086,
508
+ "LateLegalizeInst": 0.012919187545776367,
509
+ "LateLegalizePostSplit": 0.007997751235961914,
510
+ "LateLowerReshapeOp": 0.011852502822875977,
511
+ "LateLowerTensorOp": 0.007149696350097656,
512
+ "LateNeuronInstComb": 0.053853750228881836,
513
+ "LayoutPreprocessing": 0.07254910469055176,
514
+ "LayoutPreprocessingAndAnalysis": 0.13735532760620117,
515
+ "LayoutRequirementAnalysis": 0.012064695358276367,
516
+ "LegalizeCCOpLayout": 0.003309011459350586,
517
+ "LegalizeOpLevelAlias": 0.004944324493408203,
518
+ "LegalizePartitionReduce": 0.002275705337524414,
519
+ "LegalizeSundaAccess": 0.13529706001281738,
520
+ "LegalizeSundaMacro": 0.017252445220947266,
521
+ "LegalizeType": 0.007556915283203125,
522
+ "LocalLayoutOpt": 0.04438447952270508,
523
+ "LoopFusion": 0.018953561782836914,
524
+ "LoopSplitting": 0.0016851425170898438,
525
+ "LowerBroadcast": 0.005589485168457031,
526
+ "LowerCCOpBlockAxis": 0.009353399276733398,
527
+ "LowerComplexBroadcast": 0.011426210403442383,
528
+ "LowerIntrinsics": 0.04210019111633301,
529
+ "LowerShardAxis": 0.014751195907592773,
530
+ "LowerTensorOp": 0.02877187728881836,
531
+ "LowerToSendRecv": 0.006161689758300781,
532
+ "LowerTranspose": 0.02186894416809082,
533
+ "MacroGeneration": 0.1734302043914795,
534
+ "MaskPropagation": 0.014665842056274414,
535
+ "MemcpyElimination": 0.3008904457092285,
536
+ "MutateDataType": 0.0027010440826416016,
537
+ "NeuronAliasDependencyInduction": 0.0006909370422363281,
538
+ "NeuronAliasDependencyReset": 0.022809267044067383,
539
+ "NeuronInstComb": 0.005879402160644531,
540
+ "NeuronLICM": 0.0464015007019043,
541
+ "NeuronLoopFusion": 0.05638718605041504,
542
+ "NeuronLoopInterchange": 0.00871729850769043,
543
+ "NeuronSimplifier": 0.02101302146911621,
544
+ "NeuronSimplifyPredicates": 0.004530191421508789,
545
+ "NeuronValueNumbering": 0.007061004638671875,
546
+ "OptimizeAliasedCopyChain": 0.001558065414428711,
547
+ "OptimizeNKIKernels": 0.3715829849243164,
548
+ "PAGLayoutOpt": 0.648719310760498,
549
+ "PComputeCutting": 0.02423238754272461,
550
+ "PGLayoutTilingPipeline": 2.515984058380127,
551
+ "PGTiling": 0.46158504486083984,
552
+ "PadElimination": 0.0023555755615234375,
553
+ "ParAxesAnnotation": 0.5548486709594727,
554
+ "PartialLoopFusion": 0.04628252983093262,
555
+ "PartialSimdFusion": 0.06029558181762695,
556
+ "PerfectLoopNest": 0.0032892227172851563,
557
+ "RecognizeOpIdiom": 0.01747274398803711,
558
+ "Recompute": 0.00046896934509277344,
559
+ "RelaxPredicates": 0.00874948501586914,
560
+ "Rematerialization": 0.023741722106933594,
561
+ "RemoveShardedPartitionAxes": 0.041913747787475586,
562
+ "ReshapeWeights": 0.0023987293243408203,
563
+ "ResolveAccessConflict": 0.013326883316040039,
564
+ "ResolveComplicatePredicates": 0.0010704994201660156,
565
+ "RewriteReplicationMatmul": 0.00213623046875,
566
+ "RewriteWeights": 0.006081342697143555,
567
+ "SFKVectorizer": 0.5432095527648926,
568
+ "ShardingPropagationAnalysis": 0.04027843475341797,
569
+ "SimpleAllReduceTiling": 0.005087375640869141,
570
+ "Simplifier": 0.008136272430419922,
571
+ "SimplifyMacroPredicates": 0.010492086410522461,
572
+ "SimplifyNeuronTensor": 0.033696889877319336,
573
+ "SimplifySlice": 0.0016849040985107422,
574
+ "SimplifyTensor": 0.013016223907470703,
575
+ "SpillPSum": 0.04322075843811035,
576
+ "SplitAPUnionSets": 0.04480147361755371,
577
+ "SplitAccGrp": 0.0033092498779296875,
578
+ "StaticProfiler": 0.02093505859375,
579
+ "StaticTransposeLocalTensor": 0.011444330215454102,
580
+ "SundaISel": 0.0645599365234375,
581
+ "TCTransform": 0.0017342567443847656,
582
+ "TensorInitialization": 0.014005661010742188,
583
+ "TensorOpSimplifier": 0.010408163070678711,
584
+ "TensorOpTransform": 0.062005043029785156,
585
+ "TileCCOps": 0.007296085357666016,
586
+ "TilingProfiler": 0.04326295852661133,
587
+ "TransformConvOp": 0.004875659942626953,
588
+ "TritiumFusion": 0.12003302574157715,
589
+ "ValueNumbering": 0.007851839065551758,
590
+ "VectorizeDMA": 0.008031368255615234,
591
+ "VectorizeMatMult": 0.030368566513061523,
592
+ "WeightCoalescing": 0.009224891662597656,
593
+ "ZeroSizeTensorElimination": 0.0001709461212158203
594
+ },
595
+ "tensorizer": {
596
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 3453.0,
597
+ "StaticProfiler::AifUb": 66.1578598022461,
598
+ "StaticProfiler::ArithmeticIntensityTensorizer": 256.2751770019531,
599
+ "StaticProfiler::AverageDmaLength": 1973.780029296875,
600
+ "StaticProfiler::AverageFractalPeUtilization": 99.81855773925781,
601
+ "StaticProfiler::AveragePartitionUtilization": 99.43334197998047,
602
+ "StaticProfiler::AveragePeUtilization": 99.31205749511719,
603
+ "StaticProfiler::DDRTransferBytes": 122882568.0,
604
+ "StaticProfiler::InternalTransferBytes": 87572480.0,
605
+ "StaticProfiler::LoadExpanded": 18965.0,
606
+ "StaticProfiler::LocalizationEfficiency": 387.36920166015625,
607
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 656.1036376953125,
608
+ "StaticProfiler::StoreExpanded": 17921.0,
609
+ "StaticProfiler::TotalDMAExpanded": 36886.0,
610
+ "StaticProfiler::TotalDynamicInstancesCount": 4675.0,
611
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 4662.0,
612
+ "StaticProfiler::TotalLNCComm": 0.0,
613
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
614
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
615
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
616
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
617
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
618
+ "TilingProfiler::GenericInstructionsAfterTiling": 192.0,
619
+ "TilingProfiler::MatMultInstructionsAfterTiling": 1552.0,
620
+ "TilingProfiler::NumPfTransposes": 7.0,
621
+ "TilingProfiler::NumPfTransposesForIo": 1.0,
622
+ "TilingProfiler::NumPfTransposesForLocal": 3.0,
623
+ "TilingProfiler::NumPfTransposesForNonlocal": 3.0,
624
+ "TilingProfiler::PfTransposeInstructions": 896.0,
625
+ "TilingProfiler::PfTransposeInstructionsForIo": 256.0,
626
+ "TilingProfiler::PfTransposeInstructionsForLocal": 256.0,
627
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 384.0,
628
+ "TilingProfiler::ReduceInstructionsAfterTiling": 0.0,
629
+ "TilingProfiler::SimdInstructionsAfterTiling": 468.0,
630
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
631
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
632
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
633
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
634
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
635
+ "TransformConvOp::conv2d_column_packing": 0.0,
636
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
637
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
638
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
639
+ }
640
+ },
641
+ "sg0001": {
642
+ "compiletime": {
643
+ "AGOrderingAnalysisPass": 0.10222506523132324,
644
+ "AffinePredicateResolution": 0.002437591552734375,
645
+ "AliasDependencyElimination": 0.00020074844360351563,
646
+ "AliasDependencyInduction": 0.030005455017089844,
647
+ "AliasDependencyReset": 0.08542060852050781,
648
+ "BFComputeCutting": 0.009021759033203125,
649
+ "BirCodeGenLoop": 0.0576014518737793,
650
+ "CCOpFusion": 0.07059645652770996,
651
+ "CanonicalizeDAGForPGTiling": 0.011131525039672852,
652
+ "CanonicalizeIR": 0.0030748844146728516,
653
+ "CoalesceCCOp": 0.016925573348999023,
654
+ "CommuteConcat": 0.004233837127685547,
655
+ "DMALocalityOpt": 0.0022597312927246094,
656
+ "DMAProfiler": 0.011726617813110352,
657
+ "DMATilingProfiler": 0.010080099105834961,
658
+ "DataLocalityOpt": 0.45432257652282715,
659
+ "DataStreaming": 0.007066249847412109,
660
+ "DeConcat": 0.010270833969116211,
661
+ "DeadCodeElimination": 0.003401517868041992,
662
+ "DeadStoreElimination": 0.08969426155090332,
663
+ "DelinearIndices": 0.020795345306396484,
664
+ "Delinearization": 0.006405353546142578,
665
+ "DelinearizeSPMD": 0.031574249267578125,
666
+ "DoNothing": 0.00010728836059570313,
667
+ "DramToDramTranspose": 0.021518468856811523,
668
+ "DumpGraphAndMetadata": 0.00677490234375,
669
+ "EliminateDivs": 0.0029458999633789063,
670
+ "ExpandBatchNorm": 0.003565549850463867,
671
+ "ExpandISAMacro": 0.006104230880737305,
672
+ "FactorizeBlkDims": 0.03833317756652832,
673
+ "FactorizeThreadAxesInFreeDims": 0.007614850997924805,
674
+ "FlattenMacroLoop": 0.01127004623413086,
675
+ "GenericAccessSimplifier": 0.0043070316314697266,
676
+ "InferInitValue": 0.06825661659240723,
677
+ "InferIntrinsicOnCC": 0.046250104904174805,
678
+ "InferNeuronTensor": 0.09652161598205566,
679
+ "InferNonlocalTensors": 0.08535599708557129,
680
+ "InferPSumTensor": 0.08618307113647461,
681
+ "InferShardAxis": 0.6054186820983887,
682
+ "InferSharedMemLoc": 0.007490873336791992,
683
+ "InlineNativeKernels": 0.0046694278717041016,
684
+ "InsertCoreBarrier": 0.00831913948059082,
685
+ "InsertIOTransposes": 0.07386589050292969,
686
+ "InsertImplicitShardAxisBeforeISel": 0.012522697448730469,
687
+ "InsertLocalTransposes": 0.018398761749267578,
688
+ "InsertOffloadedTransposes": 0.03478860855102539,
689
+ "LICM": 0.006189107894897461,
690
+ "LateLegalizeInst": 0.018419742584228516,
691
+ "LateLegalizePostSplit": 0.011380195617675781,
692
+ "LateLowerReshapeOp": 0.006206035614013672,
693
+ "LateLowerTensorOp": 0.006627559661865234,
694
+ "LateNeuronInstComb": 0.013695240020751953,
695
+ "LayoutPreprocessing": 0.08205723762512207,
696
+ "LayoutPreprocessingAndAnalysis": 0.3778700828552246,
697
+ "LayoutRequirementAnalysis": 0.027397871017456055,
698
+ "LegalizeCCOpLayout": 0.004743337631225586,
699
+ "LegalizeOpLevelAlias": 0.001989126205444336,
700
+ "LegalizePartitionReduce": 0.003030061721801758,
701
+ "LegalizeSundaAccess": 0.026180505752563477,
702
+ "LegalizeSundaMacro": 0.02354145050048828,
703
+ "LegalizeType": 0.012012004852294922,
704
+ "LocalLayoutOpt": 0.09747910499572754,
705
+ "LoopFusion": 0.011905670166015625,
706
+ "LoopSplitting": 0.005662441253662109,
707
+ "LowerBroadcast": 0.0031082630157470703,
708
+ "LowerCCOpBlockAxis": 0.015021800994873047,
709
+ "LowerComplexBroadcast": 0.004594564437866211,
710
+ "LowerIntrinsics": 0.061724185943603516,
711
+ "LowerShardAxis": 0.01390695571899414,
712
+ "LowerTensorOp": 0.032297372817993164,
713
+ "LowerToSendRecv": 0.005787849426269531,
714
+ "LowerTranspose": 0.014832496643066406,
715
+ "MacroGeneration": 0.17066407203674316,
716
+ "MaskPropagation": 0.004767894744873047,
717
+ "MemcpyElimination": 0.3223605155944824,
718
+ "MutateDataType": 0.0023605823516845703,
719
+ "NeuronAliasDependencyInduction": 0.0017361640930175781,
720
+ "NeuronAliasDependencyReset": 0.02784562110900879,
721
+ "NeuronInstComb": 0.008632659912109375,
722
+ "NeuronLICM": 0.01805901527404785,
723
+ "NeuronLoopFusion": 0.041216135025024414,
724
+ "NeuronLoopInterchange": 0.0041141510009765625,
725
+ "NeuronSimplifier": 0.025291919708251953,
726
+ "NeuronSimplifyPredicates": 0.007104635238647461,
727
+ "NeuronValueNumbering": 0.0058324337005615234,
728
+ "OptimizeAliasedCopyChain": 0.0016317367553710938,
729
+ "OptimizeNKIKernels": 0.4839596748352051,
730
+ "PAGLayoutOpt": 0.3772914409637451,
731
+ "PComputeCutting": 0.03927016258239746,
732
+ "PGLayoutTilingPipeline": 2.7096974849700928,
733
+ "PGTiling": 0.5330896377563477,
734
+ "PadElimination": 0.0010271072387695313,
735
+ "ParAxesAnnotation": 0.32303333282470703,
736
+ "PartialLoopFusion": 0.05098128318786621,
737
+ "PartialSimdFusion": 0.10409116744995117,
738
+ "PerfectLoopNest": 0.008025884628295898,
739
+ "RecognizeOpIdiom": 0.014155864715576172,
740
+ "Recompute": 0.0006039142608642578,
741
+ "RelaxPredicates": 0.007999897003173828,
742
+ "Rematerialization": 0.0150146484375,
743
+ "RemoveShardedPartitionAxes": 0.04702639579772949,
744
+ "ReshapeWeights": 0.0015103816986083984,
745
+ "ResolveAccessConflict": 0.0074825286865234375,
746
+ "ResolveComplicatePredicates": 0.002012014389038086,
747
+ "RewriteReplicationMatmul": 0.002730846405029297,
748
+ "RewriteWeights": 0.01182103157043457,
749
+ "SFKVectorizer": 0.4407639503479004,
750
+ "ShardingPropagationAnalysis": 0.029230833053588867,
751
+ "SimpleAllReduceTiling": 0.005069255828857422,
752
+ "Simplifier": 0.020698070526123047,
753
+ "SimplifyMacroPredicates": 0.021116018295288086,
754
+ "SimplifyNeuronTensor": 0.012060403823852539,
755
+ "SimplifySlice": 0.0015597343444824219,
756
+ "SimplifyTensor": 0.014514684677124023,
757
+ "SpillPSum": 0.048569679260253906,
758
+ "SplitAPUnionSets": 0.05286097526550293,
759
+ "SplitAccGrp": 0.002934694290161133,
760
+ "StaticProfiler": 0.013947248458862305,
761
+ "StaticTransposeLocalTensor": 0.00755763053894043,
762
+ "SundaISel": 0.06808805465698242,
763
+ "TCTransform": 0.0025751590728759766,
764
+ "TensorInitialization": 0.005185127258300781,
765
+ "TensorOpSimplifier": 0.024057626724243164,
766
+ "TensorOpTransform": 0.06213688850402832,
767
+ "TileCCOps": 0.025543689727783203,
768
+ "TilingProfiler": 0.02153778076171875,
769
+ "TransformConvOp": 0.007241010665893555,
770
+ "TritiumFusion": 0.1687297821044922,
771
+ "ValueNumbering": 0.009909868240356445,
772
+ "VectorizeDMA": 0.008072137832641602,
773
+ "VectorizeMatMult": 0.042955636978149414,
774
+ "WeightCoalescing": 0.003875732421875,
775
+ "ZeroSizeTensorElimination": 0.00020575523376464844
776
+ },
777
+ "tensorizer": {
778
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 8283.0,
779
+ "StaticProfiler::AifUb": 502.6534729003906,
780
+ "StaticProfiler::ArithmeticIntensityTensorizer": 413.67962646484375,
781
+ "StaticProfiler::AverageDmaLength": 2481.933349609375,
782
+ "StaticProfiler::AverageFractalPeUtilization": 100.0,
783
+ "StaticProfiler::AveragePartitionUtilization": 99.62867736816406,
784
+ "StaticProfiler::AveragePeUtilization": 100.0,
785
+ "StaticProfiler::DDRTransferBytes": 266536960.0,
786
+ "StaticProfiler::InternalTransferBytes": 79167488.0,
787
+ "StaticProfiler::LoadExpanded": 71809.0,
788
+ "StaticProfiler::LocalizationEfficiency": 82.29916381835938,
789
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 103.46524047851563,
790
+ "StaticProfiler::StoreExpanded": 18433.0,
791
+ "StaticProfiler::TotalDMAExpanded": 90242.0,
792
+ "StaticProfiler::TotalDynamicInstancesCount": 9699.0,
793
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 9699.0,
794
+ "StaticProfiler::TotalLNCComm": 0.0,
795
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
796
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
797
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
798
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
799
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
800
+ "TilingProfiler::GenericInstructionsAfterTiling": 128.0,
801
+ "TilingProfiler::MatMultInstructionsAfterTiling": 6144.0,
802
+ "TilingProfiler::NumPfTransposes": 8.0,
803
+ "TilingProfiler::NumPfTransposesForIo": 3.0,
804
+ "TilingProfiler::NumPfTransposesForLocal": 3.0,
805
+ "TilingProfiler::NumPfTransposesForNonlocal": 2.0,
806
+ "TilingProfiler::PfTransposeInstructions": 992.0,
807
+ "TilingProfiler::PfTransposeInstructionsForIo": 288.0,
808
+ "TilingProfiler::PfTransposeInstructionsForLocal": 192.0,
809
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 512.0,
810
+ "TilingProfiler::ReduceInstructionsAfterTiling": 0.0,
811
+ "TilingProfiler::SimdInstructionsAfterTiling": 547.0,
812
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
813
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
814
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
815
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
816
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
817
+ "TransformConvOp::conv2d_column_packing": 0.0,
818
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
819
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
820
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
821
+ }
822
+ },
823
+ "sg0002": {
824
+ "compiletime": {
825
+ "AGOrderingAnalysisPass": 0.04760026931762695,
826
+ "AffinePredicateResolution": 0.003319978713989258,
827
+ "AliasDependencyElimination": 0.0002167224884033203,
828
+ "AliasDependencyInduction": 0.008548259735107422,
829
+ "AliasDependencyReset": 0.03149843215942383,
830
+ "BFComputeCutting": 0.00810694694519043,
831
+ "BirCodeGenLoop": 0.2911098003387451,
832
+ "CCOpFusion": 0.08548593521118164,
833
+ "CanonicalizeDAGForPGTiling": 0.007600545883178711,
834
+ "CanonicalizeIR": 0.0030400753021240234,
835
+ "CoalesceCCOp": 0.008062601089477539,
836
+ "CommuteConcat": 0.007961034774780273,
837
+ "DMALocalityOpt": 0.002327442169189453,
838
+ "DMAProfiler": 0.009556293487548828,
839
+ "DMATilingProfiler": 0.009016752243041992,
840
+ "DataLocalityOpt": 0.17029356956481934,
841
+ "DataStreaming": 0.007345914840698242,
842
+ "DeConcat": 0.01120138168334961,
843
+ "DeadCodeElimination": 0.010882377624511719,
844
+ "DeadStoreElimination": 0.010195016860961914,
845
+ "DelinearIndices": 0.010077953338623047,
846
+ "Delinearization": 0.011870861053466797,
847
+ "DelinearizeSPMD": 0.035944223403930664,
848
+ "DoNothing": 0.0001087188720703125,
849
+ "DramToDramTranspose": 0.013046979904174805,
850
+ "DumpGraphAndMetadata": 0.03416705131530762,
851
+ "EliminateDivs": 0.004259586334228516,
852
+ "ExpandBatchNorm": 0.0017371177673339844,
853
+ "ExpandISAMacro": 0.0058269500732421875,
854
+ "FactorizeBlkDims": 0.03687334060668945,
855
+ "FactorizeThreadAxesInFreeDims": 0.00911855697631836,
856
+ "FlattenMacroLoop": 0.0048520565032958984,
857
+ "GenericAccessSimplifier": 0.001367330551147461,
858
+ "InferInitValue": 0.0836641788482666,
859
+ "InferIntrinsicOnCC": 0.008740901947021484,
860
+ "InferNeuronTensor": 0.05709338188171387,
861
+ "InferNonlocalTensors": 0.041548728942871094,
862
+ "InferPSumTensor": 0.05230545997619629,
863
+ "InferShardAxis": 0.5781030654907227,
864
+ "InferSharedMemLoc": 0.026081323623657227,
865
+ "InlineNativeKernels": 0.002477407455444336,
866
+ "InsertCoreBarrier": 0.008142948150634766,
867
+ "InsertIOTransposes": 0.039937734603881836,
868
+ "InsertImplicitShardAxisBeforeISel": 0.013466596603393555,
869
+ "InsertLocalTransposes": 0.018125534057617188,
870
+ "InsertOffloadedTransposes": 0.014874696731567383,
871
+ "LICM": 0.0058231353759765625,
872
+ "LateLegalizeInst": 0.01174783706665039,
873
+ "LateLegalizePostSplit": 0.02429652214050293,
874
+ "LateLowerReshapeOp": 0.0018832683563232422,
875
+ "LateLowerTensorOp": 0.0021920204162597656,
876
+ "LateNeuronInstComb": 0.043119192123413086,
877
+ "LayoutPreprocessing": 0.06973385810852051,
878
+ "LayoutPreprocessingAndAnalysis": 0.11140203475952148,
879
+ "LayoutRequirementAnalysis": 0.013022661209106445,
880
+ "LegalizeCCOpLayout": 0.0020427703857421875,
881
+ "LegalizeOpLevelAlias": 0.0016918182373046875,
882
+ "LegalizePartitionReduce": 0.0030241012573242188,
883
+ "LegalizeSundaAccess": 0.045601606369018555,
884
+ "LegalizeSundaMacro": 0.02708148956298828,
885
+ "LegalizeType": 0.014174222946166992,
886
+ "LocalLayoutOpt": 0.022045135498046875,
887
+ "LoopFusion": 0.029404163360595703,
888
+ "LoopSplitting": 0.0007355213165283203,
889
+ "LowerBroadcast": 0.005047321319580078,
890
+ "LowerCCOpBlockAxis": 0.007714748382568359,
891
+ "LowerComplexBroadcast": 0.005654096603393555,
892
+ "LowerIntrinsics": 0.04253792762756348,
893
+ "LowerShardAxis": 0.03305673599243164,
894
+ "LowerTensorOp": 0.028458356857299805,
895
+ "LowerToSendRecv": 0.03391242027282715,
896
+ "LowerTranspose": 0.04655814170837402,
897
+ "MacroGeneration": 0.06428074836730957,
898
+ "MaskPropagation": 0.0036263465881347656,
899
+ "MemcpyElimination": 0.05451250076293945,
900
+ "MutateDataType": 0.001516103744506836,
901
+ "NeuronAliasDependencyInduction": 0.0005834102630615234,
902
+ "NeuronAliasDependencyReset": 0.022034168243408203,
903
+ "NeuronInstComb": 0.04628133773803711,
904
+ "NeuronLICM": 0.026567935943603516,
905
+ "NeuronLoopFusion": 0.07339620590209961,
906
+ "NeuronLoopInterchange": 0.0027348995208740234,
907
+ "NeuronSimplifier": 0.021918296813964844,
908
+ "NeuronSimplifyPredicates": 0.014072179794311523,
909
+ "NeuronValueNumbering": 0.013863325119018555,
910
+ "OptimizeAliasedCopyChain": 0.0008976459503173828,
911
+ "OptimizeNKIKernels": 4.611967086791992,
912
+ "PAGLayoutOpt": 0.2917053699493408,
913
+ "PComputeCutting": 0.008776664733886719,
914
+ "PGLayoutTilingPipeline": 1.8517823219299316,
915
+ "PGTiling": 0.26313185691833496,
916
+ "PadElimination": 0.0006458759307861328,
917
+ "ParAxesAnnotation": 0.188338041305542,
918
+ "PartialLoopFusion": 0.05682229995727539,
919
+ "PartialSimdFusion": 0.0237729549407959,
920
+ "PerfectLoopNest": 0.00557398796081543,
921
+ "RecognizeOpIdiom": 0.008669376373291016,
922
+ "Recompute": 0.0005908012390136719,
923
+ "RelaxPredicates": 0.006473541259765625,
924
+ "Rematerialization": 0.011237144470214844,
925
+ "RemoveShardedPartitionAxes": 0.014671802520751953,
926
+ "ReshapeWeights": 0.0018546581268310547,
927
+ "ResolveAccessConflict": 0.008959770202636719,
928
+ "ResolveComplicatePredicates": 0.0009264945983886719,
929
+ "RewriteReplicationMatmul": 0.0037200450897216797,
930
+ "RewriteWeights": 0.008005380630493164,
931
+ "SFKVectorizer": 0.1923050880432129,
932
+ "ShardingPropagationAnalysis": 0.10689902305603027,
933
+ "SimpleAllReduceTiling": 0.003542184829711914,
934
+ "Simplifier": 0.00808858871459961,
935
+ "SimplifyMacroPredicates": 0.031823158264160156,
936
+ "SimplifyNeuronTensor": 0.013367414474487305,
937
+ "SimplifySlice": 0.001531362533569336,
938
+ "SimplifyTensor": 0.018309593200683594,
939
+ "SpillPSum": 0.03448653221130371,
940
+ "SplitAPUnionSets": 0.09693408012390137,
941
+ "SplitAccGrp": 0.0025701522827148438,
942
+ "StaticProfiler": 0.04053521156311035,
943
+ "StaticTransposeLocalTensor": 0.012635231018066406,
944
+ "SundaISel": 0.10333561897277832,
945
+ "TCTransform": 0.006776332855224609,
946
+ "TensorInitialization": 0.011014938354492188,
947
+ "TensorOpSimplifier": 0.005452632904052734,
948
+ "TensorOpTransform": 0.033481597900390625,
949
+ "TileCCOps": 0.011636972427368164,
950
+ "TilingProfiler": 0.024947643280029297,
951
+ "TransformConvOp": 0.013001441955566406,
952
+ "TritiumFusion": 0.1458723545074463,
953
+ "ValueNumbering": 0.003311634063720703,
954
+ "VectorizeDMA": 0.005986928939819336,
955
+ "VectorizeMatMult": 0.028806686401367188,
956
+ "WeightCoalescing": 0.007086515426635742,
957
+ "ZeroSizeTensorElimination": 0.00017833709716796875
958
+ },
959
+ "tensorizer": {
960
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 25519.0,
961
+ "StaticProfiler::AifUb": 337.1839904785156,
962
+ "StaticProfiler::ArithmeticIntensityTensorizer": 248.63792419433594,
963
+ "StaticProfiler::AverageDmaLength": 2413.602294921875,
964
+ "StaticProfiler::AverageFractalPeUtilization": 98.93502807617188,
965
+ "StaticProfiler::AveragePartitionUtilization": 95.0970230102539,
966
+ "StaticProfiler::AveragePeUtilization": 97.18069458007813,
967
+ "StaticProfiler::DDRTransferBytes": 495991840.0,
968
+ "StaticProfiler::InternalTransferBytes": 361682720.0,
969
+ "StaticProfiler::LoadExpanded": 133728.0,
970
+ "StaticProfiler::LocalizationEfficiency": 73.73954010009766,
971
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 79.92718505859375,
972
+ "StaticProfiler::StoreExpanded": 7530.0,
973
+ "StaticProfiler::TotalDMAExpanded": 141258.0,
974
+ "StaticProfiler::TotalDynamicInstancesCount": 30781.0,
975
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 30330.0,
976
+ "StaticProfiler::TotalLNCComm": 0.0,
977
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
978
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
979
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
980
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
981
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
982
+ "TilingProfiler::GenericInstructionsAfterTiling": 4.0,
983
+ "TilingProfiler::MatMultInstructionsAfterTiling": 14112.0,
984
+ "TilingProfiler::NumPfTransposes": 5.0,
985
+ "TilingProfiler::NumPfTransposesForIo": 1.0,
986
+ "TilingProfiler::NumPfTransposesForLocal": 1.0,
987
+ "TilingProfiler::NumPfTransposesForNonlocal": 3.0,
988
+ "TilingProfiler::PfTransposeInstructions": 10273.0,
989
+ "TilingProfiler::PfTransposeInstructionsForIo": 9504.0,
990
+ "TilingProfiler::PfTransposeInstructionsForLocal": 1.0,
991
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 768.0,
992
+ "TilingProfiler::ReduceInstructionsAfterTiling": 10.0,
993
+ "TilingProfiler::SimdInstructionsAfterTiling": 311.0,
994
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
995
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
996
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
997
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
998
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
999
+ "TransformConvOp::conv2d_column_packing": 0.0,
1000
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
1001
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
1002
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
1003
+ }
1004
+ },
1005
+ "sg01": {
1006
+ "compiletime": {
1007
+ "CanonicalizeConv": 1.8000000636675395e-05,
1008
+ "CanonicalizeForTensorizer": 1.8000000636675395e-05,
1009
+ "Canonicalizer": 0.0003330000035930425,
1010
+ "HoistCompute": 3.000000106112566e-06,
1011
+ "IdentifyCrossPassTensors": 1.5999999959603883e-05,
1012
+ "MemcastMotion": 7.000000096013537e-06,
1013
+ "PenguinizeFunctions": 1.8000000636675395e-05,
1014
+ "PruneFunctions": 1.8000000636675395e-05,
1015
+ "RemoveOptimizationBarriers": 2.4000000848900527e-05,
1016
+ "ScatterMotion": 7.000000096013537e-06,
1017
+ "TensorizerLegalizationPass": 2.300000051036477e-05,
1018
+ "VerifySupportedOps": 1.5999999959603883e-05,
1019
+ "algsimp": 9.899999713525176e-05,
1020
+ "batchnorm_expander": 1.5999999959603883e-05,
1021
+ "boundary-marker-removal": 7.000000096013537e-06,
1022
+ "call-inliner": 1.4000000192027073e-05,
1023
+ "canonicalize-boundary-marker": 7.999999979801942e-06,
1024
+ "collective-stream-id-checker": 3.999999989900971e-06,
1025
+ "comparison-expander": 7.999999979801942e-06,
1026
+ "computation-deduplicator": 2.700000004551839e-05,
1027
+ "config-lowering": 4.999999873689376e-05,
1028
+ "constant_folding": 1.4000000192027073e-05,
1029
+ "cse": 1.8000000636675395e-05,
1030
+ "dce": 9.999999974752427e-07,
1031
+ "dynamic-slice-transpose": 6.000000212225132e-06,
1032
+ "eliminate-redundant-compare": 4.999999873689376e-06,
1033
+ "emit-offloaded-dropout": 1.5999999959603883e-05,
1034
+ "flatten-call-graph": 1.1000000085914508e-05,
1035
+ "fuse-send-recv": 2.9000000722589903e-05,
1036
+ "hilo-conditional-to-select": 9.000000318337698e-06,
1037
+ "hilo::LegalizeAlias": 6.000000212225132e-06,
1038
+ "hilo::NeuronInstCombine": 5.400000009103678e-05,
1039
+ "hilo::NeuronOpFusion": 1.2000000424450263e-05,
1040
+ "hilo::ReplaceTokenTypeWithU8Pass": 1.8000000636675395e-05,
1041
+ "hilo::ScheduleFusion": 9.999999974752427e-07,
1042
+ "hilo::SixtyFourHack": 1.5999999959603883e-05,
1043
+ "hilo::VerifyAliasing": 1.9999999949504854e-06,
1044
+ "hlo-mac-count": 0.00012700000661425292,
1045
+ "legalize-ccops-for-tensorizer": 9.999999974752427e-07,
1046
+ "legalize-compare": 4.999999873689376e-06,
1047
+ "lower-argminmax-custom-call": 4.999999873689376e-06,
1048
+ "map-inline": 1.4000000192027073e-05,
1049
+ "metadata-naming": 2.9000000722589903e-05,
1050
+ "mlir::detail::OpToOpPassAdaptor": 2.499999936844688e-05,
1051
+ "mlir::hlo::MhloToPyPenguin": 0.0017209999496117234,
1052
+ "mlir::mhlo::LowerComplexExtraPass": 7.200000254670158e-05,
1053
+ "mlir::mhlo::LowerComplexPass": 0.00014099999680183828,
1054
+ "native-to-custom-softmax": 7.000000096013537e-06,
1055
+ "native-to-custom-softmax-dx": 2.300000051036477e-05,
1056
+ "neuron-hlo-verifier": 0.0005729999975301325,
1057
+ "operand_upcaster": 1.8999999156221747e-05,
1058
+ "post-par-pipe-begin": 9.999999974752427e-07,
1059
+ "post-par-pipe-end": 0.0,
1060
+ "post-partition-simplification": 0.0007699999841861427,
1061
+ "replace-minimum-constant": 9.000000318337698e-06,
1062
+ "reshape-mover": 3.999999989900971e-06,
1063
+ "simplify-concat": 4.8999998398358e-05,
1064
+ "simplify-while-loops": 3.000000106112566e-06,
1065
+ "transform-variadic-reduce": 1.1000000085914508e-05,
1066
+ "tuple-simplifier": 7.000000096013537e-06,
1067
+ "unpack-nested-aws-ntwsr": 4.999999873689376e-06,
1068
+ "unroll-while-loop": 2.099999983329326e-05
1069
+ },
1070
+ "hilo": {
1071
+ "ArithmeticIntensity": 661.1749267578125,
1072
+ "HloMacCount": 55834574848.0,
1073
+ "Traffic": 168895008.0
1074
+ }
1075
+ },
1076
+ "sg02": {
1077
+ "compiletime": {
1078
+ "CanonicalizeConv": 7.000000096013537e-06,
1079
+ "CanonicalizeForTensorizer": 1.9999999494757503e-05,
1080
+ "Canonicalizer": 0.0004039999912492931,
1081
+ "HoistCompute": 0.0,
1082
+ "IdentifyCrossPassTensors": 1.9999999494757503e-05,
1083
+ "MemcastMotion": 0.0,
1084
+ "PenguinizeFunctions": 1.8000000636675395e-05,
1085
+ "PruneFunctions": 9.999999747378752e-06,
1086
+ "RemoveOptimizationBarriers": 2.499999936844688e-05,
1087
+ "ScatterMotion": 1.9999999949504854e-06,
1088
+ "TensorizerLegalizationPass": 9.000000318337698e-06,
1089
+ "VerifySupportedOps": 1.8000000636675395e-05,
1090
+ "algsimp": 0.00012599999899975955,
1091
+ "batchnorm_expander": 1.700000029813964e-05,
1092
+ "boundary-marker-removal": 6.000000212225132e-06,
1093
+ "call-inliner": 1.8999999156221747e-05,
1094
+ "canonicalize-boundary-marker": 7.000000096013537e-06,
1095
+ "collective-stream-id-checker": 3.999999989900971e-06,
1096
+ "comparison-expander": 2.9999999242136255e-05,
1097
+ "computation-deduplicator": 3.199999991920777e-05,
1098
+ "config-lowering": 6.500000017695129e-05,
1099
+ "constant_folding": 1.700000029813964e-05,
1100
+ "cse": 2.499999936844688e-05,
1101
+ "dce": 1.9999999949504854e-06,
1102
+ "dynamic-slice-transpose": 7.000000096013537e-06,
1103
+ "eliminate-redundant-compare": 4.999999873689376e-06,
1104
+ "emit-offloaded-dropout": 1.9999999494757503e-05,
1105
+ "flatten-call-graph": 1.700000029813964e-05,
1106
+ "fuse-send-recv": 3.899999865097925e-05,
1107
+ "hilo-conditional-to-select": 1.2000000424450263e-05,
1108
+ "hilo::LegalizeAlias": 3.000000106112566e-06,
1109
+ "hilo::NeuronInstCombine": 1.4000000192027073e-05,
1110
+ "hilo::NeuronOpFusion": 1.8000000636675395e-05,
1111
+ "hilo::ReplaceTokenTypeWithU8Pass": 2.099999983329326e-05,
1112
+ "hilo::ScheduleFusion": 9.999999974752427e-07,
1113
+ "hilo::SixtyFourHack": 6.199999916134402e-05,
1114
+ "hilo::VerifyAliasing": 9.999999974752427e-07,
1115
+ "hlo-mac-count": 0.007534000091254711,
1116
+ "legalize-ccops-for-tensorizer": 1.9999999949504854e-06,
1117
+ "legalize-compare": 3.999999989900971e-06,
1118
+ "lower-argminmax-custom-call": 6.000000212225132e-06,
1119
+ "map-inline": 1.5999999959603883e-05,
1120
+ "metadata-naming": 2.499999936844688e-05,
1121
+ "mlir::detail::OpToOpPassAdaptor": 1.2999999853491317e-05,
1122
+ "mlir::hlo::MhloToPyPenguin": 0.008725999854505062,
1123
+ "mlir::mhlo::LowerComplexExtraPass": 7.899999764049426e-05,
1124
+ "mlir::mhlo::LowerComplexPass": 0.0001320000010309741,
1125
+ "native-to-custom-softmax": 6.000000212225132e-06,
1126
+ "native-to-custom-softmax-dx": 2.5999999706982635e-05,
1127
+ "neuron-hlo-verifier": 0.0005200000014156103,
1128
+ "operand_upcaster": 1.700000029813964e-05,
1129
+ "post-par-pipe-begin": 1.9999999949504854e-06,
1130
+ "post-par-pipe-end": 0.0,
1131
+ "post-partition-simplification": 0.00076299998909235,
1132
+ "replace-minimum-constant": 1.4999999621068127e-05,
1133
+ "reshape-mover": 4.999999873689376e-06,
1134
+ "simplify-concat": 5.199999941396527e-05,
1135
+ "simplify-while-loops": 3.999999989900971e-06,
1136
+ "transform-variadic-reduce": 7.100000220816582e-05,
1137
+ "tuple-simplifier": 7.999999979801942e-06,
1138
+ "unpack-nested-aws-ntwsr": 6.000000212225132e-06,
1139
+ "unroll-while-loop": 9.999999974752427e-07
1140
+ },
1141
+ "hilo": {
1142
+ "ArithmeticIntensity": 207.31654357910156,
1143
+ "HloMacCount": 38811336704.0,
1144
+ "Traffic": 374416192.0
1145
+ }
1146
+ },
1147
+ "topk": {
1148
+ "compiletime": {
1149
+ "CoalesceCCOp": 0.012049198150634766,
1150
+ "DMALocalityOpt": 0.013970613479614258,
1151
+ "DMAProfiler": 0.007668733596801758,
1152
+ "DataStreaming": 0.031991004943847656,
1153
+ "DoNothing": 0.0002658367156982422,
1154
+ "ExpandISAMacro": 0.007843732833862305,
1155
+ "FactorizeBlkDims": 0.03324699401855469,
1156
+ "InferPSumTensor": 0.17986130714416504,
1157
+ "InferSharedMemLoc": 0.0050508975982666016,
1158
+ "InsertCoreBarrier": 0.0073986053466796875,
1159
+ "LateLegalizeInst": 0.02333354949951172,
1160
+ "LateNeuronInstComb": 0.01965785026550293,
1161
+ "LegalizeSundaAccess": 0.0358271598815918,
1162
+ "LegalizeType": 0.026246309280395508,
1163
+ "LowerBroadcast": 0.023288965225219727,
1164
+ "LowerIntrinsics": 0.008131980895996094,
1165
+ "LowerTranspose": 0.004733085632324219,
1166
+ "NeuronInstComb": 0.01128530502319336,
1167
+ "NeuronLICM": 0.027612686157226563,
1168
+ "NeuronSimplifyPredicates": 0.006512641906738281,
1169
+ "NeuronValueNumbering": 0.007419109344482422,
1170
+ "SFKVectorizer": 0.09510302543640137,
1171
+ "SimpleAllReduceTiling": 0.0070476531982421875,
1172
+ "SimplifyNeuronTensor": 0.11079812049865723,
1173
+ "SpillPSum": 0.058808088302612305,
1174
+ "WeightCoalescing": 0.0070688724517822266
1175
+ }
1176
+ }
1177
+ }
context_encoding_model/_tp0_bk4/graph.neff ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83bfd67384e1a0c5645609060b8bfb6fc5cfe3dbbd75b7568508606e623f387d
3
+ size 1926144
context_encoding_model/_tp0_bk4/log-neuron-cc.txt ADDED
The diff for this file is too large to render. See raw diff
 
context_encoding_model/_tp0_bk4/metaneff.pb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f2c9cfa0cd764e2b2f060557a0315ea75ce71a4875299aa863b7564b6f41b711
3
+ size 3644060
context_encoding_model/_tp0_bk4/model.MODULE_95ef7ca73cc0a6161be2+96be3c33.hlo_module.pb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c564e37d09483fd3fa5207db2f0d41a54a9993b618c3243e9e641c74a7d8a5c
3
+ size 3730846
context_encoding_model/_tp0_bk4/model.MODULE_95ef7ca73cc0a6161be2+96be3c33.neff ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83bfd67384e1a0c5645609060b8bfb6fc5cfe3dbbd75b7568508606e623f387d
3
+ size 1926144
context_encoding_model/_tp0_bk4/neuron_config.json ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation_autoset": false,
3
+ "_name_or_path": "/home/ubuntu/models/Qwen3-1.7B",
4
+ "add_cross_attention": false,
5
+ "architectures": [
6
+ "Qwen3ForCausalLM"
7
+ ],
8
+ "attention_bias": false,
9
+ "attention_dropout": 0.0,
10
+ "attribute_map": {},
11
+ "bad_words_ids": null,
12
+ "begin_suppress_tokens": null,
13
+ "bos_token_id": 151643,
14
+ "chunk_size_feed_forward": 0,
15
+ "cross_attention_hidden_size": null,
16
+ "decoder_start_token_id": null,
17
+ "diversity_penalty": 0.0,
18
+ "do_sample": false,
19
+ "early_stopping": false,
20
+ "encoder_no_repeat_ngram_size": 0,
21
+ "eos_token_id": 151645,
22
+ "exponential_decay_length_penalty": null,
23
+ "finetuning_task": null,
24
+ "forced_bos_token_id": null,
25
+ "forced_eos_token_id": null,
26
+ "fused_spec_config": null,
27
+ "head_dim": 128,
28
+ "hidden_act": "silu",
29
+ "hidden_size": 2048,
30
+ "id2label": {
31
+ "0": "LABEL_0",
32
+ "1": "LABEL_1"
33
+ },
34
+ "initializer_range": 0.02,
35
+ "intermediate_size": 6144,
36
+ "is_decoder": false,
37
+ "is_encoder_decoder": false,
38
+ "label2id": {
39
+ "LABEL_0": 0,
40
+ "LABEL_1": 1
41
+ },
42
+ "length_penalty": 1.0,
43
+ "max_length": 20,
44
+ "max_position_embeddings": 40960,
45
+ "max_window_layers": 28,
46
+ "metadata": null,
47
+ "min_length": 0,
48
+ "model_type": "qwen3",
49
+ "neuron_config": {
50
+ "activation_quantization_type": null,
51
+ "allow_input_truncation": false,
52
+ "apply_seq_ids_mask": false,
53
+ "async_mode": false,
54
+ "attention_dp_degree": 1,
55
+ "attention_dtype": null,
56
+ "attn_block_cte_nki_kernel_enabled": false,
57
+ "attn_block_tkg_nki_kernel_cache_update": false,
58
+ "attn_block_tkg_nki_kernel_cascaded_attention": false,
59
+ "attn_block_tkg_nki_kernel_enabled": false,
60
+ "attn_cls": {
61
+ "__module__": "neuronx_distributed_inference.models.qwen3.modeling_qwen3",
62
+ "__name__": "NeuronQwen3Attention"
63
+ },
64
+ "attn_kernel_enabled": null,
65
+ "attn_tkg_builtin_kernel_enabled": false,
66
+ "attn_tkg_nki_kernel_enabled": false,
67
+ "batch_size": 1,
68
+ "bucket_n_active_tokens": true,
69
+ "buckets": [
70
+ 2048
71
+ ],
72
+ "cast_type": "config",
73
+ "cc_pipeline_tiling_factor": 2,
74
+ "chunked_prefill_config": null,
75
+ "context_encoding_buckets": [
76
+ 2048
77
+ ],
78
+ "cp_degree": 1,
79
+ "ctx_batch_size": 1,
80
+ "disable_kv_cache_tiling": false,
81
+ "draft_model_modules_to_not_convert": null,
82
+ "enable_bucketing": true,
83
+ "enable_cte_modular_flow": false,
84
+ "enable_eagle_draft_input_norm": false,
85
+ "enable_eagle_speculation": false,
86
+ "enable_fused_speculation": false,
87
+ "enable_long_context_mode": false,
88
+ "enable_output_completion_notifications": false,
89
+ "enable_spill_reload_dge": false,
90
+ "enable_token_tree": false,
91
+ "ep_degree": 1,
92
+ "expert_mlp_nki_kernel_enabled": null,
93
+ "flash_decoding_enabled": false,
94
+ "fused_qkv": false,
95
+ "fused_rmsnorm_skip_gamma": false,
96
+ "is_block_kv_layout": null,
97
+ "is_chunked_prefill": false,
98
+ "is_continuous_batching": true,
99
+ "is_eagle_draft": false,
100
+ "is_medusa": false,
101
+ "is_prefill_stage": true,
102
+ "is_prefix_caching": false,
103
+ "k_cache_transposed": false,
104
+ "kv_cache_batch_size": 8,
105
+ "kv_cache_padding_size": 0,
106
+ "kv_cache_quant": false,
107
+ "kv_cache_tiling": false,
108
+ "layer_boundary_markers": false,
109
+ "lm_head_pad": true,
110
+ "lm_head_pad_alignment_size": 1,
111
+ "local_ranks_size": 2,
112
+ "logical_nc_config": 2,
113
+ "lora_config": null,
114
+ "max_batch_size": 8,
115
+ "max_context_length": 4096,
116
+ "max_length": 4096,
117
+ "max_new_tokens": null,
118
+ "medusa_speculation_length": 0,
119
+ "medusa_tree": null,
120
+ "mlp_kernel_enabled": false,
121
+ "mlp_kernel_fuse_residual_add": false,
122
+ "modules_to_not_convert": null,
123
+ "moe_fused_nki_kernel_enabled": null,
124
+ "n_active_tokens": 4096,
125
+ "n_positions": 4096,
126
+ "num_medusa_heads": 0,
127
+ "on_cpu": false,
128
+ "on_device_sampling_config": {
129
+ "deterministic": false,
130
+ "do_sample": false,
131
+ "dynamic": true,
132
+ "global_topk": 256,
133
+ "on_device_sampling_config": true,
134
+ "temperature": 1.0,
135
+ "top_k": 1,
136
+ "top_k_kernel_enabled": false,
137
+ "top_p": 1.0
138
+ },
139
+ "output_logits": false,
140
+ "overrides_torch_dtype": true,
141
+ "pa_block_size": 4096,
142
+ "pa_num_blocks": 8,
143
+ "padding_side": "right",
144
+ "pp_degree": 1,
145
+ "prefix_buckets": null,
146
+ "qk_layernorm": false,
147
+ "qkv_kernel_enabled": false,
148
+ "qkv_kernel_fuse_residual_add": false,
149
+ "qkv_kernel_nbsd_layout": false,
150
+ "quantization_dtype": "int8",
151
+ "quantization_type": "per_tensor_symmetric",
152
+ "quantize_clamp_bound": Infinity,
153
+ "quantized": false,
154
+ "quantized_checkpoints_path": null,
155
+ "quantized_mlp_kernel_enabled": false,
156
+ "rmsnorm_quantize_kernel_enabled": false,
157
+ "router_topk_nki_kernel_enabled": null,
158
+ "rpl_reduce_dtype": null,
159
+ "save_sharded_checkpoint": true,
160
+ "scratchpad_page_size": null,
161
+ "seq_len": 4096,
162
+ "seq_len_threshold_for_cc_tiling": 16384,
163
+ "sequence_parallel_enabled": false,
164
+ "shared_mlp_nki_kernel_enabled": null,
165
+ "skip_sharding": false,
166
+ "skip_warmup": false,
167
+ "spec_batch_size": 8,
168
+ "speculation_length": 0,
169
+ "start_rank_id": 0,
170
+ "strided_context_parallel_kernel_enabled": false,
171
+ "target": null,
172
+ "tensor_capture_config": null,
173
+ "tile_cc": false,
174
+ "tkg_batch_size": 8,
175
+ "token_generation_buckets": null,
176
+ "token_tree_config": null,
177
+ "torch_dtype": "bfloat16",
178
+ "tp_degree": 2,
179
+ "vocab_parallel": false,
180
+ "weight_gather_seq_len_threshold": 32768,
181
+ "weights_to_skip_layout_optimization": [],
182
+ "world_size": 2
183
+ },
184
+ "no_repeat_ngram_size": 0,
185
+ "num_attention_heads": 16,
186
+ "num_beam_groups": 1,
187
+ "num_beams": 1,
188
+ "num_cores_per_group": 1,
189
+ "num_hidden_layers": 28,
190
+ "num_key_value_heads": 8,
191
+ "num_return_sequences": 1,
192
+ "output_attentions": false,
193
+ "output_hidden_states": false,
194
+ "output_scores": false,
195
+ "pad_token_id": 0,
196
+ "prefix": null,
197
+ "problem_type": null,
198
+ "pruned_heads": {},
199
+ "remove_invalid_values": false,
200
+ "repetition_penalty": 1.0,
201
+ "return_dict": true,
202
+ "return_dict_in_generate": false,
203
+ "rms_norm_eps": 1e-06,
204
+ "rope_scaling": null,
205
+ "rope_theta": 1000000,
206
+ "sep_token_id": null,
207
+ "sliding_window": null,
208
+ "suppress_tokens": null,
209
+ "task_specific_params": null,
210
+ "temperature": 1.0,
211
+ "tf_legacy_loss": false,
212
+ "tie_encoder_decoder": false,
213
+ "tie_word_embeddings": true,
214
+ "tokenizer_class": null,
215
+ "top_k": 50,
216
+ "top_p": 1.0,
217
+ "torchscript": false,
218
+ "transformers_version": "4.51.0",
219
+ "typical_p": 1.0,
220
+ "use_bfloat16": false,
221
+ "use_cache": true,
222
+ "use_sliding_window": false,
223
+ "vocab_size": 151936
224
+ }
context_encoding_model/_tp0_bk5/command.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ neuronx-cc compile --framework=XLA model.MODULE_96a8f4e12dc810958634+b1e26cef.hlo_module.pb --output model.MODULE_96a8f4e12dc810958634+b1e26cef.neff --target=trn2 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ' --lnc=2 -O1 '--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true' --logfile=log-neuron-cc.txt --verbose=35
context_encoding_model/_tp0_bk5/compile_flags.MODULE_96a8f4e12dc810958634+b1e26cef.json ADDED
@@ -0,0 +1 @@
 
 
1
+ ["--target=trn2", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "--lnc=2", "-O1", "--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true", "--logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk5/log-neuron-cc.txt"]
context_encoding_model/_tp0_bk5/global_metric_store.json ADDED
@@ -0,0 +1,1177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Average": {
3
+ "tensorizer": {
4
+ "StaticProfiler::AverageFractalPeUtilization": 99.12728881835938,
5
+ "StaticProfiler::AveragePartitionUtilization": 95.96998596191406,
6
+ "StaticProfiler::AveragePeUtilization": 97.68225860595703,
7
+ "StaticProfiler::LocalizationEfficiency": 56.908729553222656,
8
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 63.73067855834961,
9
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
10
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0
11
+ }
12
+ },
13
+ "Count": {
14
+ "tensorizer": {
15
+ "StaticProfiler::AverageFractalPeUtilization": 1.0,
16
+ "StaticProfiler::AveragePartitionUtilization": 1.0,
17
+ "StaticProfiler::AveragePeUtilization": 1.0,
18
+ "StaticProfiler::LocalizationEfficiency": 1.0,
19
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1.0,
20
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 1.0,
21
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 1.0
22
+ }
23
+ },
24
+ "Sum": {
25
+ "compiletime": {
26
+ "AGOrderingAnalysisPass": 0.03893709182739258,
27
+ "AffinePredicateResolution": 0.00975942611694336,
28
+ "AliasDependencyElimination": 0.00020766258239746094,
29
+ "AliasDependencyInduction": 0.014848947525024414,
30
+ "AliasDependencyReset": 0.0507814884185791,
31
+ "BFComputeCutting": 0.004155397415161133,
32
+ "BirCodeGenLoop": 0.384446382522583,
33
+ "CCOpFusion": 0.11220550537109375,
34
+ "CanonicalizeConv": 1.8999999156221747e-05,
35
+ "CanonicalizeDAGForPGTiling": 0.013774633407592773,
36
+ "CanonicalizeForTensorizer": 5.0000002374872565e-05,
37
+ "CanonicalizeIR": 0.002764463424682617,
38
+ "Canonicalizer": 0.0008950000046752393,
39
+ "CoalesceCCOp": 0.01839923858642578,
40
+ "CommuteConcat": 0.0019075870513916016,
41
+ "DMALocalityOpt": 0.00996088981628418,
42
+ "DMAProfiler": 0.02422189712524414,
43
+ "DMATilingProfiler": 0.007188081741333008,
44
+ "DataLocalityOpt": 0.15634822845458984,
45
+ "DataStreaming": 0.03180813789367676,
46
+ "DeConcat": 0.0020532608032226563,
47
+ "DeadCodeElimination": 0.002146482467651367,
48
+ "DeadStoreElimination": 0.024139404296875,
49
+ "DelinearIndices": 0.013254880905151367,
50
+ "Delinearization": 0.007935047149658203,
51
+ "DelinearizeSPMD": 0.023029565811157227,
52
+ "DoNothing": 0.0005247592926025391,
53
+ "DramToDramTranspose": 0.012213945388793945,
54
+ "DumpGraphAndMetadata": 0.03455543518066406,
55
+ "EliminateDivs": 0.01893448829650879,
56
+ "ExpandBatchNorm": 0.007169485092163086,
57
+ "ExpandISAMacro": 0.019716739654541016,
58
+ "FactorizeBlkDims": 0.0747368335723877,
59
+ "FactorizeThreadAxesInFreeDims": 0.0075495243072509766,
60
+ "FlattenMacroLoop": 0.007609844207763672,
61
+ "GenericAccessSimplifier": 0.0013933181762695313,
62
+ "HoistCompute": 4.999999873689376e-06,
63
+ "IdentifyCrossPassTensors": 3.899999865097925e-05,
64
+ "InferInitValue": 0.10064125061035156,
65
+ "InferIntrinsicOnCC": 0.026311159133911133,
66
+ "InferNeuronTensor": 0.05008339881896973,
67
+ "InferNonlocalTensors": 0.05733203887939453,
68
+ "InferPSumTensor": 0.1221306324005127,
69
+ "InferShardAxis": 0.6304898262023926,
70
+ "InferSharedMemLoc": 0.0429539680480957,
71
+ "InlineNativeKernels": 0.00394749641418457,
72
+ "InsertCoreBarrier": 0.01845526695251465,
73
+ "InsertIOTransposes": 0.04183030128479004,
74
+ "InsertImplicitShardAxisBeforeISel": 0.01711416244506836,
75
+ "InsertLocalTransposes": 0.0077512264251708984,
76
+ "InsertOffloadedTransposes": 0.010181665420532227,
77
+ "LICM": 0.005186319351196289,
78
+ "LateLegalizeInst": 0.04364776611328125,
79
+ "LateLegalizePostSplit": 0.03845643997192383,
80
+ "LateLowerReshapeOp": 0.0019919872283935547,
81
+ "LateLowerTensorOp": 0.0022301673889160156,
82
+ "LateNeuronInstComb": 0.04980278015136719,
83
+ "LayoutPreprocessing": 0.05747699737548828,
84
+ "LayoutPreprocessingAndAnalysis": 0.09093403816223145,
85
+ "LayoutRequirementAnalysis": 0.010792970657348633,
86
+ "LegalizeCCOpLayout": 0.0032892227172851563,
87
+ "LegalizeOpLevelAlias": 0.0013661384582519531,
88
+ "LegalizePartitionReduce": 0.006167411804199219,
89
+ "LegalizeSundaAccess": 0.10145425796508789,
90
+ "LegalizeSundaMacro": 0.051756858825683594,
91
+ "LegalizeType": 0.07339167594909668,
92
+ "LocalLayoutOpt": 0.021276235580444336,
93
+ "LoopFusion": 0.006464719772338867,
94
+ "LoopSplitting": 0.0007054805755615234,
95
+ "LowerBroadcast": 0.01979851722717285,
96
+ "LowerCCOpBlockAxis": 0.008892297744750977,
97
+ "LowerComplexBroadcast": 0.0035398006439208984,
98
+ "LowerIntrinsics": 0.05094194412231445,
99
+ "LowerShardAxis": 0.04483389854431152,
100
+ "LowerTensorOp": 0.025528907775878906,
101
+ "LowerToSendRecv": 0.04537153244018555,
102
+ "LowerTranspose": 0.040845394134521484,
103
+ "MacroGeneration": 0.08503556251525879,
104
+ "MaskPropagation": 0.007714748382568359,
105
+ "MemcastMotion": 1.9999999494757503e-05,
106
+ "MemcpyElimination": 0.062020301818847656,
107
+ "MutateDataType": 0.0020122528076171875,
108
+ "NeuronAliasDependencyInduction": 0.0006520748138427734,
109
+ "NeuronAliasDependencyReset": 0.10503625869750977,
110
+ "NeuronInstComb": 0.057951927185058594,
111
+ "NeuronLICM": 0.05489492416381836,
112
+ "NeuronLoopFusion": 0.05422854423522949,
113
+ "NeuronLoopInterchange": 0.0029349327087402344,
114
+ "NeuronSimplifier": 0.026484966278076172,
115
+ "NeuronSimplifyPredicates": 0.04440903663635254,
116
+ "NeuronValueNumbering": 0.02174234390258789,
117
+ "OptimizeAliasedCopyChain": 0.0018880367279052734,
118
+ "OptimizeNKIKernels": 4.115047454833984,
119
+ "PAGLayoutOpt": 0.11529350280761719,
120
+ "PComputeCutting": 0.010918140411376953,
121
+ "PGLayoutTilingPipeline": 1.6512439250946045,
122
+ "PGTiling": 0.2841973304748535,
123
+ "PadElimination": 0.0008590221405029297,
124
+ "ParAxesAnnotation": 0.07899093627929688,
125
+ "PartialLoopFusion": 0.03534102439880371,
126
+ "PartialSimdFusion": 0.021408557891845703,
127
+ "PenguinizeFunctions": 4.70000013592653e-05,
128
+ "PerfectLoopNest": 0.008621454238891602,
129
+ "PruneFunctions": 4.70000013592653e-05,
130
+ "RecognizeOpIdiom": 0.010253190994262695,
131
+ "Recompute": 0.0005791187286376953,
132
+ "RelaxPredicates": 0.013797521591186523,
133
+ "Rematerialization": 0.0054569244384765625,
134
+ "RemoveOptimizationBarriers": 4.5000000682193786e-05,
135
+ "RemoveShardedPartitionAxes": 0.03261446952819824,
136
+ "ReshapeWeights": 0.001524209976196289,
137
+ "ResolveAccessConflict": 0.019870281219482422,
138
+ "ResolveComplicatePredicates": 0.0053920745849609375,
139
+ "RewriteReplicationMatmul": 0.0025107860565185547,
140
+ "RewriteWeights": 0.009802579879760742,
141
+ "SFKVectorizer": 0.3575756549835205,
142
+ "ScatterMotion": 3.899999865097925e-05,
143
+ "ShardingPropagationAnalysis": 0.10757136344909668,
144
+ "SimpleAllReduceTiling": 0.015942096710205078,
145
+ "Simplifier": 0.005366325378417969,
146
+ "SimplifyMacroPredicates": 0.016243934631347656,
147
+ "SimplifyNeuronTensor": 0.16969990730285645,
148
+ "SimplifySlice": 0.002231597900390625,
149
+ "SimplifyTensor": 0.017529726028442383,
150
+ "SpillPSum": 0.20494413375854492,
151
+ "SplitAPUnionSets": 0.15779972076416016,
152
+ "SplitAccGrp": 0.005539894104003906,
153
+ "StaticProfiler": 0.046514272689819336,
154
+ "StaticTransposeLocalTensor": 0.008464574813842773,
155
+ "SundaISel": 0.07130837440490723,
156
+ "TCTransform": 0.002462148666381836,
157
+ "TensorInitialization": 0.011480093002319336,
158
+ "TensorOpSimplifier": 0.008947134017944336,
159
+ "TensorOpTransform": 0.06947088241577148,
160
+ "TensorizerLegalizationPass": 5.699999746866524e-05,
161
+ "TileCCOps": 0.012774467468261719,
162
+ "TilingProfiler": 0.014863967895507813,
163
+ "TransformConvOp": 0.006424665451049805,
164
+ "TritiumFusion": 0.11082077026367188,
165
+ "ValueNumbering": 0.0049648284912109375,
166
+ "VectorizeDMA": 0.004624843597412109,
167
+ "VectorizeMatMult": 0.028928518295288086,
168
+ "VerifySupportedOps": 3.899999865097925e-05,
169
+ "WeightCoalescing": 0.013041973114013672,
170
+ "ZeroSizeTensorElimination": 0.00021529197692871094,
171
+ "algsimp": 0.001961000030860305,
172
+ "batchnorm_expander": 3.5000000934815034e-05,
173
+ "boundary-marker-removal": 1.1000000085914508e-05,
174
+ "call-inliner": 0.0003279999946244061,
175
+ "canonicalize-boundary-marker": 1.3999999282532372e-05,
176
+ "collective-stream-id-checker": 9.40000027185306e-05,
177
+ "comparison-expander": 0.000506000011228025,
178
+ "computation-deduplicator": 5.499999679159373e-05,
179
+ "config-lowering": 0.00011899999663000926,
180
+ "constant-statistics": 0.0004400000034365803,
181
+ "constant_folding": 0.00030700000934302807,
182
+ "cse": 3.5000000934815034e-05,
183
+ "dce": 7.700000423938036e-05,
184
+ "dot_decomposer": 0.0009110000100918114,
185
+ "dynamic-slice-transpose": 1.1000000085914508e-05,
186
+ "eliminate-redundant-compare": 0.00028899998869746923,
187
+ "emit-offloaded-dropout": 5.6000000768108293e-05,
188
+ "flatten-call-graph": 0.0006600000197067857,
189
+ "fuse-send-recv": 5.5999997130129486e-05,
190
+ "hilo-conditional-to-select": 1.3999999282532372e-05,
191
+ "hilo::LegalizeAlias": 1.2000000424450263e-05,
192
+ "hilo::NeuronInstCombine": 0.00012000000424450263,
193
+ "hilo::NeuronOpFusion": 1.4999999621068127e-05,
194
+ "hilo::ReplaceTokenTypeWithU8Pass": 3.599999763537198e-05,
195
+ "hilo::ScheduleFusion": 4.999999873689376e-06,
196
+ "hilo::SixtyFourHack": 9.899999713525176e-05,
197
+ "hilo::VerifyAliasing": 6.000000212225132e-06,
198
+ "hlo-mac-count": 0.012987000867724419,
199
+ "instruction-histogram": 0.0007619999814778566,
200
+ "io-con-pipe-begin": 7.000000096013537e-06,
201
+ "io-con-pipe-end": 9.999999974752427e-07,
202
+ "io-layout-normalization": 0.001361000002361834,
203
+ "io-statistics": 5.6000000768108293e-05,
204
+ "legalize-ccops-for-tensorizer": 3.000000106112566e-06,
205
+ "legalize-compare": 1.1000000085914508e-05,
206
+ "lower-argminmax-custom-call": 9.999999747378752e-06,
207
+ "map-inline": 0.0008399999933317304,
208
+ "metadata-naming": 4.400000034365803e-05,
209
+ "mlir::detail::OpToOpPassAdaptor": 5.7999997807201e-05,
210
+ "mlir::hlo::MhloToPyPenguin": 0.010812999680638313,
211
+ "mlir::mhlo::LowerComplexExtraPass": 0.0003440000000409782,
212
+ "mlir::mhlo::LowerComplexPass": 0.0004799999878741801,
213
+ "native-to-custom-softmax": 0.00035600000410340726,
214
+ "native-to-custom-softmax-dx": 0.0006880000000819564,
215
+ "neuron-hlo-verifier": 0.01168300025165081,
216
+ "operand_upcaster": 5.5999997130129486e-05,
217
+ "opt-barrier-removal": 0.0003150000120513141,
218
+ "post-par-pipe-begin": 1.4000000192027073e-05,
219
+ "post-par-pipe-end": 0.0,
220
+ "post-partition-simplification": 0.0015290000010281801,
221
+ "pre-par-pipe-begin": 9.999999974752427e-07,
222
+ "pre-par-pipe-end": 0.0,
223
+ "pre-partition-simplification": 0.09849400073289871,
224
+ "replace-minimum-constant": 0.0004569999873638153,
225
+ "reshape-mover": 0.0001030000057653524,
226
+ "simplify-concat": 0.00010699999984353781,
227
+ "simplify-while-loops": 7.700000423938036e-05,
228
+ "transform-variadic-reduce": 5.8999998145736754e-05,
229
+ "tuple-simplifier": 0.00025900002219714224,
230
+ "unpack-nested-aws-ntwsr": 0.00023599999258294702,
231
+ "unroll-while-loop": 1.2000000424450263e-05,
232
+ "zero_sized_hlo_elimination": 0.000783999974373728
233
+ },
234
+ "hilo": {
235
+ "ConstantSize": 7348863.0,
236
+ "HloInputCount": 371.0,
237
+ "HloMacCount": 240674799616.0,
238
+ "HloOutputCount": 57.0,
239
+ "IfmapSize": 3910944768.0,
240
+ "OfmapSize": 1879048192.0,
241
+ "OutputsReadFromCount": 0.0,
242
+ "PassthroughTensorsCount": 0.0,
243
+ "RedundantOutputCount": 0.0,
244
+ "Traffic": 1088551040.0
245
+ },
246
+ "tensorizer": {
247
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 31232.0,
248
+ "StaticProfiler::AifUb": 538.6357421875,
249
+ "StaticProfiler::ArithmeticIntensityTensorizer": 306.53076171875,
250
+ "StaticProfiler::AverageDmaLength": 2517.368896484375,
251
+ "StaticProfiler::DDRTransferBytes": 672177216.0,
252
+ "StaticProfiler::InternalTransferBytes": 407820064.0,
253
+ "StaticProfiler::LoadExpanded": 189029.0,
254
+ "StaticProfiler::StoreExpanded": 13673.0,
255
+ "StaticProfiler::TotalDMAExpanded": 202702.0,
256
+ "StaticProfiler::TotalDynamicInstancesCount": 37700.0,
257
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 37249.0,
258
+ "StaticProfiler::TotalLNCComm": 0.0,
259
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
260
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
261
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
262
+ "TilingProfiler::GenericInstructionsAfterTiling": 4.0,
263
+ "TilingProfiler::MatMultInstructionsAfterTiling": 18720.0,
264
+ "TilingProfiler::NumPfTransposes": 5.0,
265
+ "TilingProfiler::NumPfTransposesForIo": 1.0,
266
+ "TilingProfiler::NumPfTransposesForLocal": 1.0,
267
+ "TilingProfiler::NumPfTransposesForNonlocal": 3.0,
268
+ "TilingProfiler::PfTransposeInstructions": 11041.0,
269
+ "TilingProfiler::PfTransposeInstructionsForIo": 9504.0,
270
+ "TilingProfiler::PfTransposeInstructionsForLocal": 1.0,
271
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 1536.0,
272
+ "TilingProfiler::ReduceInstructionsAfterTiling": 18.0,
273
+ "TilingProfiler::SimdInstructionsAfterTiling": 604.0,
274
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
275
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
276
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
277
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
278
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
279
+ "TransformConvOp::conv2d_column_packing": 0.0,
280
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
281
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
282
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
283
+ }
284
+ },
285
+ "all": {
286
+ "compiletime": {
287
+ "algsimp": 0.0018090000376105309,
288
+ "call-inliner": 0.0003000000142492354,
289
+ "collective-stream-id-checker": 8.499999967170879e-05,
290
+ "comparison-expander": 0.0004920000210404396,
291
+ "constant-statistics": 0.0004400000034365803,
292
+ "constant_folding": 0.00028300000121816993,
293
+ "dce": 7.400000322377309e-05,
294
+ "dot_decomposer": 0.0009110000100918114,
295
+ "eliminate-redundant-compare": 0.0002789999998640269,
296
+ "flatten-call-graph": 0.0006380000268109143,
297
+ "hlo-mac-count": 0.007658000104129314,
298
+ "instruction-histogram": 0.0007619999814778566,
299
+ "io-con-pipe-begin": 7.000000096013537e-06,
300
+ "io-con-pipe-end": 9.999999974752427e-07,
301
+ "io-layout-normalization": 0.001361000002361834,
302
+ "io-statistics": 5.6000000768108293e-05,
303
+ "map-inline": 0.0008089999901130795,
304
+ "native-to-custom-softmax": 0.000307999987853691,
305
+ "native-to-custom-softmax-dx": 0.0004140000091865659,
306
+ "neuron-hlo-verifier": 0.010607999749481678,
307
+ "opt-barrier-removal": 0.0003150000120513141,
308
+ "pre-par-pipe-begin": 9.999999974752427e-07,
309
+ "pre-par-pipe-end": 0.0,
310
+ "pre-partition-simplification": 0.09849400073289871,
311
+ "replace-minimum-constant": 0.00043899999582208693,
312
+ "reshape-mover": 9.500000305706635e-05,
313
+ "simplify-while-loops": 7.100000220816582e-05,
314
+ "tuple-simplifier": 0.0002460000105202198,
315
+ "unpack-nested-aws-ntwsr": 0.00022600000374950469,
316
+ "unroll-while-loop": 1.2000000424450263e-05,
317
+ "zero_sized_hlo_elimination": 0.000783999974373728
318
+ }
319
+ },
320
+ "attention_isa_kernel": {
321
+ "compiletime": {
322
+ "CoalesceCCOp": 0.00021338462829589844,
323
+ "DMALocalityOpt": 0.0002186298370361328,
324
+ "DMAProfiler": 0.00027632713317871094,
325
+ "DataStreaming": 0.00021409988403320313,
326
+ "DoNothing": 0.0033321380615234375,
327
+ "ExpandISAMacro": 0.00029921531677246094,
328
+ "FactorizeBlkDims": 0.000396728515625,
329
+ "InferPSumTensor": 0.0006210803985595703,
330
+ "InferSharedMemLoc": 0.0006666183471679688,
331
+ "InsertCoreBarrier": 0.00035572052001953125,
332
+ "LateLegalizeInst": 0.00023174285888671875,
333
+ "LateNeuronInstComb": 0.000492095947265625,
334
+ "LegalizeSundaAccess": 0.0002181529998779297,
335
+ "LegalizeType": 0.0002846717834472656,
336
+ "LowerBroadcast": 0.00025916099548339844,
337
+ "LowerIntrinsics": 0.00029730796813964844,
338
+ "LowerTranspose": 0.0002589225769042969,
339
+ "NeuronInstComb": 0.000469207763671875,
340
+ "NeuronLICM": 0.00020599365234375,
341
+ "NeuronSimplifyPredicates": 0.0002067089080810547,
342
+ "NeuronValueNumbering": 0.0002777576446533203,
343
+ "SFKVectorizer": 0.0018928050994873047,
344
+ "SimpleAllReduceTiling": 0.00020241737365722656,
345
+ "SimplifyNeuronTensor": 0.0006334781646728516,
346
+ "SpillPSum": 0.0007383823394775391,
347
+ "WeightCoalescing": 0.00025081634521484375
348
+ }
349
+ },
350
+ "cumsum": {
351
+ "compiletime": {
352
+ "CoalesceCCOp": 0.0003447532653808594,
353
+ "DMALocalityOpt": 0.0003294944763183594,
354
+ "DMAProfiler": 0.0012810230255126953,
355
+ "DataStreaming": 0.0005331039428710938,
356
+ "DoNothing": 0.00017762184143066406,
357
+ "ExpandISAMacro": 0.0009202957153320313,
358
+ "FactorizeBlkDims": 0.0006163120269775391,
359
+ "InferPSumTensor": 0.0011057853698730469,
360
+ "InferSharedMemLoc": 0.0004899501800537109,
361
+ "InsertCoreBarrier": 0.0004894733428955078,
362
+ "LateLegalizeInst": 0.0006704330444335938,
363
+ "LateNeuronInstComb": 0.0013632774353027344,
364
+ "LegalizeSundaAccess": 0.0025315284729003906,
365
+ "LegalizeType": 0.00039649009704589844,
366
+ "LowerBroadcast": 0.0004820823669433594,
367
+ "LowerIntrinsics": 0.0004119873046875,
368
+ "LowerTranspose": 0.0004839897155761719,
369
+ "NeuronInstComb": 0.0013201236724853516,
370
+ "NeuronLICM": 0.0006861686706542969,
371
+ "NeuronSimplifyPredicates": 0.011016607284545898,
372
+ "NeuronValueNumbering": 0.0007073879241943359,
373
+ "SFKVectorizer": 0.012517213821411133,
374
+ "SimpleAllReduceTiling": 0.0003895759582519531,
375
+ "SimplifyNeuronTensor": 0.0022177696228027344,
376
+ "SpillPSum": 0.0009493827819824219,
377
+ "WeightCoalescing": 0.00035071372985839844
378
+ }
379
+ },
380
+ "sg00": {
381
+ "compiletime": {
382
+ "CanonicalizeConv": 1.2999999853491317e-05,
383
+ "CanonicalizeForTensorizer": 1.8000000636675395e-05,
384
+ "Canonicalizer": 0.00029600001289509237,
385
+ "HoistCompute": 3.000000106112566e-06,
386
+ "IdentifyCrossPassTensors": 1.4000000192027073e-05,
387
+ "MemcastMotion": 7.999999979801942e-06,
388
+ "PenguinizeFunctions": 1.8000000636675395e-05,
389
+ "PruneFunctions": 3.000000106112566e-06,
390
+ "RemoveOptimizationBarriers": 1.700000029813964e-05,
391
+ "ScatterMotion": 1.700000029813964e-05,
392
+ "TensorizerLegalizationPass": 2.9000000722589903e-05,
393
+ "VerifySupportedOps": 1.2000000424450263e-05,
394
+ "algsimp": 4.8000001697801054e-05,
395
+ "batchnorm_expander": 1.2000000424450263e-05,
396
+ "boundary-marker-removal": 3.999999989900971e-06,
397
+ "call-inliner": 7.000000096013537e-06,
398
+ "canonicalize-boundary-marker": 4.999999873689376e-06,
399
+ "collective-stream-id-checker": 3.000000106112566e-06,
400
+ "comparison-expander": 3.999999989900971e-06,
401
+ "computation-deduplicator": 1.5999999959603883e-05,
402
+ "config-lowering": 4.3000000005122274e-05,
403
+ "constant_folding": 7.000000096013537e-06,
404
+ "cse": 1.1000000085914508e-05,
405
+ "dce": 9.999999974752427e-07,
406
+ "dynamic-slice-transpose": 3.999999989900971e-06,
407
+ "eliminate-redundant-compare": 3.000000106112566e-06,
408
+ "emit-offloaded-dropout": 3.199999991920777e-05,
409
+ "flatten-call-graph": 7.000000096013537e-06,
410
+ "fuse-send-recv": 1.8000000636675395e-05,
411
+ "hilo-conditional-to-select": 3.999999989900971e-06,
412
+ "hilo::LegalizeAlias": 4.999999873689376e-06,
413
+ "hilo::NeuronInstCombine": 4.8999998398358e-05,
414
+ "hilo::NeuronOpFusion": 4.999999873689376e-06,
415
+ "hilo::ReplaceTokenTypeWithU8Pass": 1.4000000192027073e-05,
416
+ "hilo::ScheduleFusion": 1.9999999949504854e-06,
417
+ "hilo::SixtyFourHack": 1.5999999959603883e-05,
418
+ "hilo::VerifyAliasing": 3.000000106112566e-06,
419
+ "hlo-mac-count": 8.299999899463728e-05,
420
+ "legalize-ccops-for-tensorizer": 9.999999974752427e-07,
421
+ "legalize-compare": 3.999999989900971e-06,
422
+ "lower-argminmax-custom-call": 3.000000106112566e-06,
423
+ "map-inline": 9.999999747378752e-06,
424
+ "metadata-naming": 1.2999999853491317e-05,
425
+ "mlir::detail::OpToOpPassAdaptor": 1.9999999494757503e-05,
426
+ "mlir::hlo::MhloToPyPenguin": 0.0010160000529140234,
427
+ "mlir::mhlo::LowerComplexExtraPass": 0.00013899999612476677,
428
+ "mlir::mhlo::LowerComplexPass": 0.0002699999895412475,
429
+ "native-to-custom-softmax": 3.7999998312443495e-05,
430
+ "native-to-custom-softmax-dx": 0.00024399999529123306,
431
+ "neuron-hlo-verifier": 0.0003870000073220581,
432
+ "operand_upcaster": 1.700000029813964e-05,
433
+ "post-par-pipe-begin": 1.2000000424450263e-05,
434
+ "post-par-pipe-end": 0.0,
435
+ "post-partition-simplification": 0.0005039999959990382,
436
+ "replace-minimum-constant": 4.999999873689376e-06,
437
+ "reshape-mover": 3.000000106112566e-06,
438
+ "simplify-concat": 3.300000025774352e-05,
439
+ "simplify-while-loops": 1.9999999949504854e-06,
440
+ "transform-variadic-reduce": 7.000000096013537e-06,
441
+ "tuple-simplifier": 3.999999989900971e-06,
442
+ "unpack-nested-aws-ntwsr": 3.000000106112566e-06,
443
+ "unroll-while-loop": 0.0
444
+ },
445
+ "hilo": {
446
+ "ArithmeticIntensity": 183.30274963378906,
447
+ "ConstantSize": 7348863.0,
448
+ "HloInputCount": 371.0,
449
+ "HloMacCount": 42949672960.0,
450
+ "HloOutputCount": 57.0,
451
+ "IfmapSize": 3910944768.0,
452
+ "OfmapSize": 1879048192.0,
453
+ "OutputsReadFromCount": 0.0,
454
+ "PassthroughTensorsCount": 0.0,
455
+ "RedundantOutputCount": 0.0,
456
+ "Traffic": 468620064.0
457
+ }
458
+ },
459
+ "sg0000": {
460
+ "compiletime": {
461
+ "AGOrderingAnalysisPass": 0.04064464569091797,
462
+ "AffinePredicateResolution": 0.0019383430480957031,
463
+ "AliasDependencyElimination": 0.00022459030151367188,
464
+ "AliasDependencyInduction": 0.019460439682006836,
465
+ "AliasDependencyReset": 0.04814887046813965,
466
+ "BFComputeCutting": 0.0055506229400634766,
467
+ "BirCodeGenLoop": 0.13215899467468262,
468
+ "CCOpFusion": 0.11969184875488281,
469
+ "CanonicalizeDAGForPGTiling": 0.0033049583435058594,
470
+ "CanonicalizeIR": 0.0060040950775146484,
471
+ "CoalesceCCOp": 0.0054624080657958984,
472
+ "CommuteConcat": 0.002767324447631836,
473
+ "DMALocalityOpt": 0.0027179718017578125,
474
+ "DMAProfiler": 0.01582622528076172,
475
+ "DMATilingProfiler": 0.008585929870605469,
476
+ "DataLocalityOpt": 0.2429823875427246,
477
+ "DataStreaming": 0.013686180114746094,
478
+ "DeConcat": 0.0028448104858398438,
479
+ "DeadCodeElimination": 0.00874471664428711,
480
+ "DeadStoreElimination": 0.07823586463928223,
481
+ "DelinearIndices": 0.01836085319519043,
482
+ "Delinearization": 0.009904146194458008,
483
+ "DelinearizeSPMD": 0.03007340431213379,
484
+ "DoNothing": 9.870529174804688e-05,
485
+ "DramToDramTranspose": 0.014807701110839844,
486
+ "DumpGraphAndMetadata": 0.00868082046508789,
487
+ "EliminateDivs": 0.005564212799072266,
488
+ "ExpandBatchNorm": 0.0029854774475097656,
489
+ "ExpandISAMacro": 0.006433963775634766,
490
+ "FactorizeBlkDims": 0.06867551803588867,
491
+ "FactorizeThreadAxesInFreeDims": 0.008321523666381836,
492
+ "FlattenMacroLoop": 0.006778717041015625,
493
+ "GenericAccessSimplifier": 0.0014896392822265625,
494
+ "InferInitValue": 0.06406569480895996,
495
+ "InferIntrinsicOnCC": 0.022037982940673828,
496
+ "InferNeuronTensor": 0.06763529777526855,
497
+ "InferNonlocalTensors": 0.22275519371032715,
498
+ "InferPSumTensor": 0.15494084358215332,
499
+ "InferShardAxis": 0.5209276676177979,
500
+ "InferSharedMemLoc": 0.017581939697265625,
501
+ "InlineNativeKernels": 0.007895946502685547,
502
+ "InsertCoreBarrier": 0.014360427856445313,
503
+ "InsertIOTransposes": 0.026629209518432617,
504
+ "InsertImplicitShardAxisBeforeISel": 0.018111467361450195,
505
+ "InsertLocalTransposes": 0.02471637725830078,
506
+ "InsertOffloadedTransposes": 0.018056154251098633,
507
+ "LICM": 0.006089210510253906,
508
+ "LateLegalizeInst": 0.020943164825439453,
509
+ "LateLegalizePostSplit": 0.01616668701171875,
510
+ "LateLowerReshapeOp": 0.004019498825073242,
511
+ "LateLowerTensorOp": 0.014237642288208008,
512
+ "LateNeuronInstComb": 0.02029895782470703,
513
+ "LayoutPreprocessing": 0.09618091583251953,
514
+ "LayoutPreprocessingAndAnalysis": 0.1460561752319336,
515
+ "LayoutRequirementAnalysis": 0.01375579833984375,
516
+ "LegalizeCCOpLayout": 0.004752159118652344,
517
+ "LegalizeOpLevelAlias": 0.001943826675415039,
518
+ "LegalizePartitionReduce": 0.002205371856689453,
519
+ "LegalizeSundaAccess": 0.08727788925170898,
520
+ "LegalizeSundaMacro": 0.017870187759399414,
521
+ "LegalizeType": 0.01916980743408203,
522
+ "LocalLayoutOpt": 0.049512386322021484,
523
+ "LoopFusion": 0.012260913848876953,
524
+ "LoopSplitting": 0.0006864070892333984,
525
+ "LowerBroadcast": 0.006807088851928711,
526
+ "LowerCCOpBlockAxis": 0.007787466049194336,
527
+ "LowerComplexBroadcast": 0.004546642303466797,
528
+ "LowerIntrinsics": 0.04405355453491211,
529
+ "LowerShardAxis": 0.033060312271118164,
530
+ "LowerTensorOp": 0.026821613311767578,
531
+ "LowerToSendRecv": 0.011995553970336914,
532
+ "LowerTranspose": 0.02594161033630371,
533
+ "MacroGeneration": 0.11522269248962402,
534
+ "MaskPropagation": 0.003435373306274414,
535
+ "MemcpyElimination": 0.2497720718383789,
536
+ "MutateDataType": 0.0027208328247070313,
537
+ "NeuronAliasDependencyInduction": 0.002033233642578125,
538
+ "NeuronAliasDependencyReset": 0.07921051979064941,
539
+ "NeuronInstComb": 0.018134593963623047,
540
+ "NeuronLICM": 0.037050724029541016,
541
+ "NeuronLoopFusion": 0.037982940673828125,
542
+ "NeuronLoopInterchange": 0.0038917064666748047,
543
+ "NeuronSimplifier": 0.022843360900878906,
544
+ "NeuronSimplifyPredicates": 0.003104686737060547,
545
+ "NeuronValueNumbering": 0.009130239486694336,
546
+ "OptimizeAliasedCopyChain": 0.004662990570068359,
547
+ "OptimizeNKIKernels": 0.3685793876647949,
548
+ "PAGLayoutOpt": 0.6570594310760498,
549
+ "PComputeCutting": 0.012747764587402344,
550
+ "PGLayoutTilingPipeline": 2.4684011936187744,
551
+ "PGTiling": 0.4522573947906494,
552
+ "PadElimination": 0.005415916442871094,
553
+ "ParAxesAnnotation": 0.5855293273925781,
554
+ "PartialLoopFusion": 0.06675910949707031,
555
+ "PartialSimdFusion": 0.07990288734436035,
556
+ "PerfectLoopNest": 0.004445075988769531,
557
+ "RecognizeOpIdiom": 0.02440333366394043,
558
+ "Recompute": 0.0006387233734130859,
559
+ "RelaxPredicates": 0.0069468021392822266,
560
+ "Rematerialization": 0.011609554290771484,
561
+ "RemoveShardedPartitionAxes": 0.029452085494995117,
562
+ "ReshapeWeights": 0.0011801719665527344,
563
+ "ResolveAccessConflict": 0.012258052825927734,
564
+ "ResolveComplicatePredicates": 0.0021598339080810547,
565
+ "RewriteReplicationMatmul": 0.0023620128631591797,
566
+ "RewriteWeights": 0.005594730377197266,
567
+ "SFKVectorizer": 0.6774003505706787,
568
+ "ShardingPropagationAnalysis": 0.07418251037597656,
569
+ "SimpleAllReduceTiling": 0.011443138122558594,
570
+ "Simplifier": 0.006997346878051758,
571
+ "SimplifyMacroPredicates": 0.010604381561279297,
572
+ "SimplifyNeuronTensor": 0.026854515075683594,
573
+ "SimplifySlice": 0.0022373199462890625,
574
+ "SimplifyTensor": 0.013662576675415039,
575
+ "SpillPSum": 0.04489874839782715,
576
+ "SplitAPUnionSets": 0.09562921524047852,
577
+ "SplitAccGrp": 0.0030364990234375,
578
+ "StaticProfiler": 0.02321648597717285,
579
+ "StaticTransposeLocalTensor": 0.004773139953613281,
580
+ "SundaISel": 0.08316183090209961,
581
+ "TCTransform": 0.0036308765411376953,
582
+ "TensorInitialization": 0.008217096328735352,
583
+ "TensorOpSimplifier": 0.013900995254516602,
584
+ "TensorOpTransform": 0.04661202430725098,
585
+ "TileCCOps": 0.03966546058654785,
586
+ "TilingProfiler": 0.02010059356689453,
587
+ "TransformConvOp": 0.00817561149597168,
588
+ "TritiumFusion": 0.12114953994750977,
589
+ "ValueNumbering": 0.01564812660217285,
590
+ "VectorizeDMA": 0.007418394088745117,
591
+ "VectorizeMatMult": 0.042043209075927734,
592
+ "WeightCoalescing": 0.008504390716552734,
593
+ "ZeroSizeTensorElimination": 0.0001614093780517578
594
+ },
595
+ "tensorizer": {
596
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 6983.0,
597
+ "StaticProfiler::AifUb": 127.67816925048828,
598
+ "StaticProfiler::ArithmeticIntensityTensorizer": 265.79534912109375,
599
+ "StaticProfiler::AverageDmaLength": 2094.913818359375,
600
+ "StaticProfiler::AverageFractalPeUtilization": 99.83814239501953,
601
+ "StaticProfiler::AveragePartitionUtilization": 99.57943725585938,
602
+ "StaticProfiler::AveragePeUtilization": 99.35083770751953,
603
+ "StaticProfiler::DDRTransferBytes": 237259264.0,
604
+ "StaticProfiler::InternalTransferBytes": 225476608.0,
605
+ "StaticProfiler::LoadExpanded": 36391.0,
606
+ "StaticProfiler::LocalizationEfficiency": 208.176025390625,
607
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 340.6685791015625,
608
+ "StaticProfiler::StoreExpanded": 27137.0,
609
+ "StaticProfiler::TotalDMAExpanded": 63528.0,
610
+ "StaticProfiler::TotalDynamicInstancesCount": 10455.0,
611
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 10430.0,
612
+ "StaticProfiler::TotalLNCComm": 0.0,
613
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
614
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
615
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
616
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
617
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
618
+ "TilingProfiler::GenericInstructionsAfterTiling": 384.0,
619
+ "TilingProfiler::MatMultInstructionsAfterTiling": 3104.0,
620
+ "TilingProfiler::NumPfTransposes": 7.0,
621
+ "TilingProfiler::NumPfTransposesForIo": 0.0,
622
+ "TilingProfiler::NumPfTransposesForLocal": 3.0,
623
+ "TilingProfiler::NumPfTransposesForNonlocal": 4.0,
624
+ "TilingProfiler::PfTransposeInstructions": 1792.0,
625
+ "TilingProfiler::PfTransposeInstructionsForIo": 0.0,
626
+ "TilingProfiler::PfTransposeInstructionsForLocal": 512.0,
627
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 1280.0,
628
+ "TilingProfiler::ReduceInstructionsAfterTiling": 0.0,
629
+ "TilingProfiler::SimdInstructionsAfterTiling": 932.0,
630
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
631
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
632
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
633
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
634
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
635
+ "TransformConvOp::conv2d_column_packing": 0.0,
636
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
637
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
638
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
639
+ }
640
+ },
641
+ "sg0001": {
642
+ "compiletime": {
643
+ "AGOrderingAnalysisPass": 0.09063863754272461,
644
+ "AffinePredicateResolution": 0.0031011104583740234,
645
+ "AliasDependencyElimination": 0.000255584716796875,
646
+ "AliasDependencyInduction": 0.012615680694580078,
647
+ "AliasDependencyReset": 0.04242563247680664,
648
+ "BFComputeCutting": 0.005561113357543945,
649
+ "BirCodeGenLoop": 0.09979081153869629,
650
+ "CCOpFusion": 0.1346728801727295,
651
+ "CanonicalizeDAGForPGTiling": 0.012668848037719727,
652
+ "CanonicalizeIR": 0.005399465560913086,
653
+ "CoalesceCCOp": 0.007870197296142578,
654
+ "CommuteConcat": 0.002213716506958008,
655
+ "DMALocalityOpt": 0.008012056350708008,
656
+ "DMAProfiler": 0.017035484313964844,
657
+ "DMATilingProfiler": 0.014662027359008789,
658
+ "DataLocalityOpt": 0.35089898109436035,
659
+ "DataStreaming": 0.0234222412109375,
660
+ "DeConcat": 0.00548243522644043,
661
+ "DeadCodeElimination": 0.010943174362182617,
662
+ "DeadStoreElimination": 0.037809133529663086,
663
+ "DelinearIndices": 0.028621196746826172,
664
+ "Delinearization": 0.0106201171875,
665
+ "DelinearizeSPMD": 0.029047727584838867,
666
+ "DoNothing": 0.00011301040649414063,
667
+ "DramToDramTranspose": 0.01769733428955078,
668
+ "DumpGraphAndMetadata": 0.013274908065795898,
669
+ "EliminateDivs": 0.006105184555053711,
670
+ "ExpandBatchNorm": 0.0027565956115722656,
671
+ "ExpandISAMacro": 0.01057887077331543,
672
+ "FactorizeBlkDims": 0.06908917427062988,
673
+ "FactorizeThreadAxesInFreeDims": 0.00501704216003418,
674
+ "FlattenMacroLoop": 0.01100611686706543,
675
+ "GenericAccessSimplifier": 0.0046689510345458984,
676
+ "InferInitValue": 0.07929110527038574,
677
+ "InferIntrinsicOnCC": 0.03535032272338867,
678
+ "InferNeuronTensor": 0.07708048820495605,
679
+ "InferNonlocalTensors": 0.09707069396972656,
680
+ "InferPSumTensor": 0.0996854305267334,
681
+ "InferShardAxis": 0.6792669296264648,
682
+ "InferSharedMemLoc": 0.009181737899780273,
683
+ "InlineNativeKernels": 0.0036575794219970703,
684
+ "InsertCoreBarrier": 0.015471458435058594,
685
+ "InsertIOTransposes": 0.04584240913391113,
686
+ "InsertImplicitShardAxisBeforeISel": 0.008542537689208984,
687
+ "InsertLocalTransposes": 0.029177427291870117,
688
+ "InsertOffloadedTransposes": 0.01767134666442871,
689
+ "LICM": 0.007311820983886719,
690
+ "LateLegalizeInst": 0.021373271942138672,
691
+ "LateLegalizePostSplit": 0.013000011444091797,
692
+ "LateLowerReshapeOp": 0.002672910690307617,
693
+ "LateLowerTensorOp": 0.022157907485961914,
694
+ "LateNeuronInstComb": 0.038089752197265625,
695
+ "LayoutPreprocessing": 0.0897824764251709,
696
+ "LayoutPreprocessingAndAnalysis": 0.140883207321167,
697
+ "LayoutRequirementAnalysis": 0.011104106903076172,
698
+ "LegalizeCCOpLayout": 0.0038611888885498047,
699
+ "LegalizeOpLevelAlias": 0.005839109420776367,
700
+ "LegalizePartitionReduce": 0.0055887699127197266,
701
+ "LegalizeSundaAccess": 0.053086042404174805,
702
+ "LegalizeSundaMacro": 0.020623445510864258,
703
+ "LegalizeType": 0.009373188018798828,
704
+ "LocalLayoutOpt": 0.07568526268005371,
705
+ "LoopFusion": 0.03827691078186035,
706
+ "LoopSplitting": 0.0006964206695556641,
707
+ "LowerBroadcast": 0.0038139820098876953,
708
+ "LowerCCOpBlockAxis": 0.015240907669067383,
709
+ "LowerComplexBroadcast": 0.00460052490234375,
710
+ "LowerIntrinsics": 0.06653690338134766,
711
+ "LowerShardAxis": 0.034250497817993164,
712
+ "LowerTensorOp": 0.024506807327270508,
713
+ "LowerToSendRecv": 0.00830531120300293,
714
+ "LowerTranspose": 0.026538848876953125,
715
+ "MacroGeneration": 0.1462860107421875,
716
+ "MaskPropagation": 0.004972219467163086,
717
+ "MemcpyElimination": 0.17155957221984863,
718
+ "MutateDataType": 0.0026092529296875,
719
+ "NeuronAliasDependencyInduction": 0.0009496212005615234,
720
+ "NeuronAliasDependencyReset": 0.029055118560791016,
721
+ "NeuronInstComb": 0.010199785232543945,
722
+ "NeuronLICM": 0.02064967155456543,
723
+ "NeuronLoopFusion": 0.045073747634887695,
724
+ "NeuronLoopInterchange": 0.004991292953491211,
725
+ "NeuronSimplifier": 0.04068398475646973,
726
+ "NeuronSimplifyPredicates": 0.012614011764526367,
727
+ "NeuronValueNumbering": 0.008387327194213867,
728
+ "OptimizeAliasedCopyChain": 0.004460334777832031,
729
+ "OptimizeNKIKernels": 0.3194434642791748,
730
+ "PAGLayoutOpt": 0.48951292037963867,
731
+ "PComputeCutting": 0.014848470687866211,
732
+ "PGLayoutTilingPipeline": 2.5451276302337646,
733
+ "PGTiling": 0.5836856365203857,
734
+ "PadElimination": 0.000995635986328125,
735
+ "ParAxesAnnotation": 0.40463972091674805,
736
+ "PartialLoopFusion": 0.06643557548522949,
737
+ "PartialSimdFusion": 0.13411688804626465,
738
+ "PerfectLoopNest": 0.0027947425842285156,
739
+ "RecognizeOpIdiom": 0.01806020736694336,
740
+ "Recompute": 0.0004432201385498047,
741
+ "RelaxPredicates": 0.009535789489746094,
742
+ "Rematerialization": 0.008739471435546875,
743
+ "RemoveShardedPartitionAxes": 0.0267181396484375,
744
+ "ReshapeWeights": 0.0024602413177490234,
745
+ "ResolveAccessConflict": 0.00865793228149414,
746
+ "ResolveComplicatePredicates": 0.007423877716064453,
747
+ "RewriteReplicationMatmul": 0.003094196319580078,
748
+ "RewriteWeights": 0.008661746978759766,
749
+ "SFKVectorizer": 0.5552070140838623,
750
+ "ShardingPropagationAnalysis": 0.07864713668823242,
751
+ "SimpleAllReduceTiling": 0.009680747985839844,
752
+ "Simplifier": 0.010446548461914063,
753
+ "SimplifyMacroPredicates": 0.012853145599365234,
754
+ "SimplifyNeuronTensor": 0.025235891342163086,
755
+ "SimplifySlice": 0.001861572265625,
756
+ "SimplifyTensor": 0.017523765563964844,
757
+ "SpillPSum": 0.09313821792602539,
758
+ "SplitAPUnionSets": 0.07895660400390625,
759
+ "SplitAccGrp": 0.0044307708740234375,
760
+ "StaticProfiler": 0.014701604843139648,
761
+ "StaticTransposeLocalTensor": 0.008467674255371094,
762
+ "SundaISel": 0.07091832160949707,
763
+ "TCTransform": 0.0018222332000732422,
764
+ "TensorInitialization": 0.008383989334106445,
765
+ "TensorOpSimplifier": 0.013144254684448242,
766
+ "TensorOpTransform": 0.17133593559265137,
767
+ "TileCCOps": 0.018372297286987305,
768
+ "TilingProfiler": 0.022103309631347656,
769
+ "TransformConvOp": 0.00668644905090332,
770
+ "TritiumFusion": 0.25888824462890625,
771
+ "ValueNumbering": 0.00537419319152832,
772
+ "VectorizeDMA": 0.018125534057617188,
773
+ "VectorizeMatMult": 0.04329061508178711,
774
+ "WeightCoalescing": 0.006384849548339844,
775
+ "ZeroSizeTensorElimination": 0.00020265579223632813
776
+ },
777
+ "tensorizer": {
778
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 16532.0,
779
+ "StaticProfiler::AifUb": 911.9026489257813,
780
+ "StaticProfiler::ArithmeticIntensityTensorizer": 525.61767578125,
781
+ "StaticProfiler::AverageDmaLength": 2890.18798828125,
782
+ "StaticProfiler::AverageFractalPeUtilization": 100.0,
783
+ "StaticProfiler::AveragePartitionUtilization": 99.69086456298828,
784
+ "StaticProfiler::AveragePeUtilization": 100.0,
785
+ "StaticProfiler::DDRTransferBytes": 440600576.0,
786
+ "StaticProfiler::InternalTransferBytes": 226492416.0,
787
+ "StaticProfiler::LoadExpanded": 92289.0,
788
+ "StaticProfiler::LocalizationEfficiency": 57.63966751098633,
789
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 87.69190979003906,
790
+ "StaticProfiler::StoreExpanded": 26625.0,
791
+ "StaticProfiler::TotalDMAExpanded": 118914.0,
792
+ "StaticProfiler::TotalDynamicInstancesCount": 22001.0,
793
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 22001.0,
794
+ "StaticProfiler::TotalLNCComm": 0.0,
795
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
796
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
797
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
798
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
799
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
800
+ "TilingProfiler::GenericInstructionsAfterTiling": 256.0,
801
+ "TilingProfiler::MatMultInstructionsAfterTiling": 12288.0,
802
+ "TilingProfiler::NumPfTransposes": 8.0,
803
+ "TilingProfiler::NumPfTransposesForIo": 3.0,
804
+ "TilingProfiler::NumPfTransposesForLocal": 3.0,
805
+ "TilingProfiler::NumPfTransposesForNonlocal": 2.0,
806
+ "TilingProfiler::PfTransposeInstructions": 1984.0,
807
+ "TilingProfiler::PfTransposeInstructionsForIo": 576.0,
808
+ "TilingProfiler::PfTransposeInstructionsForLocal": 384.0,
809
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 1024.0,
810
+ "TilingProfiler::ReduceInstructionsAfterTiling": 0.0,
811
+ "TilingProfiler::SimdInstructionsAfterTiling": 1188.0,
812
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
813
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
814
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
815
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
816
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
817
+ "TransformConvOp::conv2d_column_packing": 0.0,
818
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
819
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
820
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
821
+ }
822
+ },
823
+ "sg0002": {
824
+ "compiletime": {
825
+ "AGOrderingAnalysisPass": 0.03893709182739258,
826
+ "AffinePredicateResolution": 0.00975942611694336,
827
+ "AliasDependencyElimination": 0.00020766258239746094,
828
+ "AliasDependencyInduction": 0.014848947525024414,
829
+ "AliasDependencyReset": 0.0507814884185791,
830
+ "BFComputeCutting": 0.004155397415161133,
831
+ "BirCodeGenLoop": 0.384446382522583,
832
+ "CCOpFusion": 0.11220550537109375,
833
+ "CanonicalizeDAGForPGTiling": 0.013774633407592773,
834
+ "CanonicalizeIR": 0.002764463424682617,
835
+ "CoalesceCCOp": 0.003862142562866211,
836
+ "CommuteConcat": 0.0019075870513916016,
837
+ "DMALocalityOpt": 0.0027344226837158203,
838
+ "DMAProfiler": 0.009855031967163086,
839
+ "DMATilingProfiler": 0.007188081741333008,
840
+ "DataLocalityOpt": 0.15634822845458984,
841
+ "DataStreaming": 0.008760213851928711,
842
+ "DeConcat": 0.0020532608032226563,
843
+ "DeadCodeElimination": 0.002146482467651367,
844
+ "DeadStoreElimination": 0.024139404296875,
845
+ "DelinearIndices": 0.013254880905151367,
846
+ "Delinearization": 0.007935047149658203,
847
+ "DelinearizeSPMD": 0.023029565811157227,
848
+ "DoNothing": 0.0001049041748046875,
849
+ "DramToDramTranspose": 0.012213945388793945,
850
+ "DumpGraphAndMetadata": 0.03455543518066406,
851
+ "EliminateDivs": 0.01893448829650879,
852
+ "ExpandBatchNorm": 0.007169485092163086,
853
+ "ExpandISAMacro": 0.007604122161865234,
854
+ "FactorizeBlkDims": 0.023853540420532227,
855
+ "FactorizeThreadAxesInFreeDims": 0.0075495243072509766,
856
+ "FlattenMacroLoop": 0.007609844207763672,
857
+ "GenericAccessSimplifier": 0.0013933181762695313,
858
+ "InferInitValue": 0.10064125061035156,
859
+ "InferIntrinsicOnCC": 0.026311159133911133,
860
+ "InferNeuronTensor": 0.05008339881896973,
861
+ "InferNonlocalTensors": 0.05733203887939453,
862
+ "InferPSumTensor": 0.0887153148651123,
863
+ "InferShardAxis": 0.6304898262023926,
864
+ "InferSharedMemLoc": 0.03429460525512695,
865
+ "InlineNativeKernels": 0.00394749641418457,
866
+ "InsertCoreBarrier": 0.009274959564208984,
867
+ "InsertIOTransposes": 0.04183030128479004,
868
+ "InsertImplicitShardAxisBeforeISel": 0.01711416244506836,
869
+ "InsertLocalTransposes": 0.0077512264251708984,
870
+ "InsertOffloadedTransposes": 0.010181665420532227,
871
+ "LICM": 0.005186319351196289,
872
+ "LateLegalizeInst": 0.015667200088500977,
873
+ "LateLegalizePostSplit": 0.03845643997192383,
874
+ "LateLowerReshapeOp": 0.0019919872283935547,
875
+ "LateLowerTensorOp": 0.0022301673889160156,
876
+ "LateNeuronInstComb": 0.018993377685546875,
877
+ "LayoutPreprocessing": 0.05747699737548828,
878
+ "LayoutPreprocessingAndAnalysis": 0.09093403816223145,
879
+ "LayoutRequirementAnalysis": 0.010792970657348633,
880
+ "LegalizeCCOpLayout": 0.0032892227172851563,
881
+ "LegalizeOpLevelAlias": 0.0013661384582519531,
882
+ "LegalizePartitionReduce": 0.006167411804199219,
883
+ "LegalizeSundaAccess": 0.03937268257141113,
884
+ "LegalizeSundaMacro": 0.051756858825683594,
885
+ "LegalizeType": 0.023316621780395508,
886
+ "LocalLayoutOpt": 0.021276235580444336,
887
+ "LoopFusion": 0.006464719772338867,
888
+ "LoopSplitting": 0.0007054805755615234,
889
+ "LowerBroadcast": 0.011565208435058594,
890
+ "LowerCCOpBlockAxis": 0.008892297744750977,
891
+ "LowerComplexBroadcast": 0.0035398006439208984,
892
+ "LowerIntrinsics": 0.04290151596069336,
893
+ "LowerShardAxis": 0.04483389854431152,
894
+ "LowerTensorOp": 0.025528907775878906,
895
+ "LowerToSendRecv": 0.04537153244018555,
896
+ "LowerTranspose": 0.024749279022216797,
897
+ "MacroGeneration": 0.08503556251525879,
898
+ "MaskPropagation": 0.007714748382568359,
899
+ "MemcpyElimination": 0.062020301818847656,
900
+ "MutateDataType": 0.0020122528076171875,
901
+ "NeuronAliasDependencyInduction": 0.0006520748138427734,
902
+ "NeuronAliasDependencyReset": 0.10503625869750977,
903
+ "NeuronInstComb": 0.026773691177368164,
904
+ "NeuronLICM": 0.03244495391845703,
905
+ "NeuronLoopFusion": 0.05422854423522949,
906
+ "NeuronLoopInterchange": 0.0029349327087402344,
907
+ "NeuronSimplifier": 0.026484966278076172,
908
+ "NeuronSimplifyPredicates": 0.02537679672241211,
909
+ "NeuronValueNumbering": 0.005478858947753906,
910
+ "OptimizeAliasedCopyChain": 0.0018880367279052734,
911
+ "OptimizeNKIKernels": 4.115047454833984,
912
+ "PAGLayoutOpt": 0.11529350280761719,
913
+ "PComputeCutting": 0.010918140411376953,
914
+ "PGLayoutTilingPipeline": 1.6512439250946045,
915
+ "PGTiling": 0.2841973304748535,
916
+ "PadElimination": 0.0008590221405029297,
917
+ "ParAxesAnnotation": 0.07899093627929688,
918
+ "PartialLoopFusion": 0.03534102439880371,
919
+ "PartialSimdFusion": 0.021408557891845703,
920
+ "PerfectLoopNest": 0.008621454238891602,
921
+ "RecognizeOpIdiom": 0.010253190994262695,
922
+ "Recompute": 0.0005791187286376953,
923
+ "RelaxPredicates": 0.013797521591186523,
924
+ "Rematerialization": 0.0054569244384765625,
925
+ "RemoveShardedPartitionAxes": 0.03261446952819824,
926
+ "ReshapeWeights": 0.001524209976196289,
927
+ "ResolveAccessConflict": 0.019870281219482422,
928
+ "ResolveComplicatePredicates": 0.0053920745849609375,
929
+ "RewriteReplicationMatmul": 0.0025107860565185547,
930
+ "RewriteWeights": 0.009802579879760742,
931
+ "SFKVectorizer": 0.240997314453125,
932
+ "ShardingPropagationAnalysis": 0.10757136344909668,
933
+ "SimpleAllReduceTiling": 0.0035986900329589844,
934
+ "Simplifier": 0.005366325378417969,
935
+ "SimplifyMacroPredicates": 0.016243934631347656,
936
+ "SimplifyNeuronTensor": 0.016655683517456055,
937
+ "SimplifySlice": 0.002231597900390625,
938
+ "SimplifyTensor": 0.017529726028442383,
939
+ "SpillPSum": 0.03337574005126953,
940
+ "SplitAPUnionSets": 0.15779972076416016,
941
+ "SplitAccGrp": 0.005539894104003906,
942
+ "StaticProfiler": 0.046514272689819336,
943
+ "StaticTransposeLocalTensor": 0.008464574813842773,
944
+ "SundaISel": 0.07130837440490723,
945
+ "TCTransform": 0.002462148666381836,
946
+ "TensorInitialization": 0.011480093002319336,
947
+ "TensorOpSimplifier": 0.008947134017944336,
948
+ "TensorOpTransform": 0.06947088241577148,
949
+ "TileCCOps": 0.012774467468261719,
950
+ "TilingProfiler": 0.014863967895507813,
951
+ "TransformConvOp": 0.006424665451049805,
952
+ "TritiumFusion": 0.11082077026367188,
953
+ "ValueNumbering": 0.0049648284912109375,
954
+ "VectorizeDMA": 0.004624843597412109,
955
+ "VectorizeMatMult": 0.028928518295288086,
956
+ "WeightCoalescing": 0.003192901611328125,
957
+ "ZeroSizeTensorElimination": 0.00021529197692871094
958
+ },
959
+ "tensorizer": {
960
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 31232.0,
961
+ "StaticProfiler::AifUb": 538.6357421875,
962
+ "StaticProfiler::ArithmeticIntensityTensorizer": 306.53076171875,
963
+ "StaticProfiler::AverageDmaLength": 2517.368896484375,
964
+ "StaticProfiler::AverageFractalPeUtilization": 99.12728881835938,
965
+ "StaticProfiler::AveragePartitionUtilization": 95.96998596191406,
966
+ "StaticProfiler::AveragePeUtilization": 97.68225860595703,
967
+ "StaticProfiler::DDRTransferBytes": 672177216.0,
968
+ "StaticProfiler::InternalTransferBytes": 407820064.0,
969
+ "StaticProfiler::LoadExpanded": 189029.0,
970
+ "StaticProfiler::LocalizationEfficiency": 56.908729553222656,
971
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 63.73067855834961,
972
+ "StaticProfiler::StoreExpanded": 13673.0,
973
+ "StaticProfiler::TotalDMAExpanded": 202702.0,
974
+ "StaticProfiler::TotalDynamicInstancesCount": 37700.0,
975
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 37249.0,
976
+ "StaticProfiler::TotalLNCComm": 0.0,
977
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
978
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
979
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
980
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
981
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
982
+ "TilingProfiler::GenericInstructionsAfterTiling": 4.0,
983
+ "TilingProfiler::MatMultInstructionsAfterTiling": 18720.0,
984
+ "TilingProfiler::NumPfTransposes": 5.0,
985
+ "TilingProfiler::NumPfTransposesForIo": 1.0,
986
+ "TilingProfiler::NumPfTransposesForLocal": 1.0,
987
+ "TilingProfiler::NumPfTransposesForNonlocal": 3.0,
988
+ "TilingProfiler::PfTransposeInstructions": 11041.0,
989
+ "TilingProfiler::PfTransposeInstructionsForIo": 9504.0,
990
+ "TilingProfiler::PfTransposeInstructionsForLocal": 1.0,
991
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 1536.0,
992
+ "TilingProfiler::ReduceInstructionsAfterTiling": 18.0,
993
+ "TilingProfiler::SimdInstructionsAfterTiling": 604.0,
994
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
995
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
996
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
997
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
998
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
999
+ "TransformConvOp::conv2d_column_packing": 0.0,
1000
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
1001
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
1002
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
1003
+ }
1004
+ },
1005
+ "sg01": {
1006
+ "compiletime": {
1007
+ "CanonicalizeConv": 6.000000212225132e-06,
1008
+ "CanonicalizeForTensorizer": 1.2000000424450263e-05,
1009
+ "Canonicalizer": 0.00033099998836405575,
1010
+ "HoistCompute": 1.9999999949504854e-06,
1011
+ "IdentifyCrossPassTensors": 1.4000000192027073e-05,
1012
+ "MemcastMotion": 6.000000212225132e-06,
1013
+ "PenguinizeFunctions": 1.2000000424450263e-05,
1014
+ "PruneFunctions": 1.2999999853491317e-05,
1015
+ "RemoveOptimizationBarriers": 1.8999999156221747e-05,
1016
+ "ScatterMotion": 1.4000000192027073e-05,
1017
+ "TensorizerLegalizationPass": 1.700000029813964e-05,
1018
+ "VerifySupportedOps": 1.4999999621068127e-05,
1019
+ "algsimp": 5.6000000768108293e-05,
1020
+ "batchnorm_expander": 1.2000000424450263e-05,
1021
+ "boundary-marker-removal": 3.999999989900971e-06,
1022
+ "call-inliner": 1.1000000085914508e-05,
1023
+ "canonicalize-boundary-marker": 4.999999873689376e-06,
1024
+ "collective-stream-id-checker": 3.000000106112566e-06,
1025
+ "comparison-expander": 4.999999873689376e-06,
1026
+ "computation-deduplicator": 1.9999999494757503e-05,
1027
+ "config-lowering": 3.600000127335079e-05,
1028
+ "constant_folding": 9.000000318337698e-06,
1029
+ "cse": 1.2000000424450263e-05,
1030
+ "dce": 9.999999974752427e-07,
1031
+ "dynamic-slice-transpose": 3.999999989900971e-06,
1032
+ "eliminate-redundant-compare": 3.999999989900971e-06,
1033
+ "emit-offloaded-dropout": 1.4000000192027073e-05,
1034
+ "flatten-call-graph": 6.000000212225132e-06,
1035
+ "fuse-send-recv": 2.2000000171829015e-05,
1036
+ "hilo-conditional-to-select": 4.999999873689376e-06,
1037
+ "hilo::LegalizeAlias": 3.999999989900971e-06,
1038
+ "hilo::NeuronInstCombine": 4.70000013592653e-05,
1039
+ "hilo::NeuronOpFusion": 7.999999979801942e-06,
1040
+ "hilo::ReplaceTokenTypeWithU8Pass": 1.4999999621068127e-05,
1041
+ "hilo::ScheduleFusion": 9.999999974752427e-07,
1042
+ "hilo::SixtyFourHack": 9.999999747378752e-06,
1043
+ "hilo::VerifyAliasing": 1.9999999949504854e-06,
1044
+ "hlo-mac-count": 8.800000068731606e-05,
1045
+ "legalize-ccops-for-tensorizer": 9.999999974752427e-07,
1046
+ "legalize-compare": 3.999999989900971e-06,
1047
+ "lower-argminmax-custom-call": 3.999999989900971e-06,
1048
+ "map-inline": 9.999999747378752e-06,
1049
+ "metadata-naming": 1.700000029813964e-05,
1050
+ "mlir::detail::OpToOpPassAdaptor": 1.700000029813964e-05,
1051
+ "mlir::hlo::MhloToPyPenguin": 0.0024689999409019947,
1052
+ "mlir::mhlo::LowerComplexExtraPass": 0.00012599999899975955,
1053
+ "mlir::mhlo::LowerComplexPass": 0.0001630000042496249,
1054
+ "native-to-custom-softmax": 4.999999873689376e-06,
1055
+ "native-to-custom-softmax-dx": 1.2999999853491317e-05,
1056
+ "neuron-hlo-verifier": 0.00035600000410340726,
1057
+ "operand_upcaster": 2.5999999706982635e-05,
1058
+ "post-par-pipe-begin": 9.999999974752427e-07,
1059
+ "post-par-pipe-end": 0.0,
1060
+ "post-partition-simplification": 0.0005549999768845737,
1061
+ "replace-minimum-constant": 6.000000212225132e-06,
1062
+ "reshape-mover": 3.000000106112566e-06,
1063
+ "simplify-concat": 4.199999966658652e-05,
1064
+ "simplify-while-loops": 1.9999999949504854e-06,
1065
+ "transform-variadic-reduce": 7.000000096013537e-06,
1066
+ "tuple-simplifier": 4.999999873689376e-06,
1067
+ "unpack-nested-aws-ntwsr": 3.000000106112566e-06,
1068
+ "unroll-while-loop": 0.0
1069
+ },
1070
+ "hilo": {
1071
+ "ArithmeticIntensity": 1091.5736083984375,
1072
+ "HloMacCount": 120259084288.0,
1073
+ "Traffic": 220340768.0
1074
+ }
1075
+ },
1076
+ "sg02": {
1077
+ "compiletime": {
1078
+ "CanonicalizeConv": 0.0,
1079
+ "CanonicalizeForTensorizer": 1.9999999494757503e-05,
1080
+ "Canonicalizer": 0.0002680000034160912,
1081
+ "HoistCompute": 0.0,
1082
+ "IdentifyCrossPassTensors": 1.1000000085914508e-05,
1083
+ "MemcastMotion": 6.000000212225132e-06,
1084
+ "PenguinizeFunctions": 1.700000029813964e-05,
1085
+ "PruneFunctions": 3.099999958067201e-05,
1086
+ "RemoveOptimizationBarriers": 9.000000318337698e-06,
1087
+ "ScatterMotion": 7.999999979801942e-06,
1088
+ "TensorizerLegalizationPass": 1.1000000085914508e-05,
1089
+ "VerifySupportedOps": 1.2000000424450263e-05,
1090
+ "algsimp": 4.8000001697801054e-05,
1091
+ "batchnorm_expander": 1.1000000085914508e-05,
1092
+ "boundary-marker-removal": 3.000000106112566e-06,
1093
+ "call-inliner": 9.999999747378752e-06,
1094
+ "canonicalize-boundary-marker": 3.999999989900971e-06,
1095
+ "collective-stream-id-checker": 3.000000106112566e-06,
1096
+ "comparison-expander": 4.999999873689376e-06,
1097
+ "computation-deduplicator": 1.8999999156221747e-05,
1098
+ "config-lowering": 3.9999998989515007e-05,
1099
+ "constant_folding": 7.999999979801942e-06,
1100
+ "cse": 1.2000000424450263e-05,
1101
+ "dce": 9.999999974752427e-07,
1102
+ "dynamic-slice-transpose": 3.000000106112566e-06,
1103
+ "eliminate-redundant-compare": 3.000000106112566e-06,
1104
+ "emit-offloaded-dropout": 9.999999747378752e-06,
1105
+ "flatten-call-graph": 9.000000318337698e-06,
1106
+ "fuse-send-recv": 1.5999999959603883e-05,
1107
+ "hilo-conditional-to-select": 4.999999873689376e-06,
1108
+ "hilo::LegalizeAlias": 3.000000106112566e-06,
1109
+ "hilo::NeuronInstCombine": 2.4000000848900527e-05,
1110
+ "hilo::NeuronOpFusion": 1.9999999949504854e-06,
1111
+ "hilo::ReplaceTokenTypeWithU8Pass": 7.000000096013537e-06,
1112
+ "hilo::ScheduleFusion": 1.9999999949504854e-06,
1113
+ "hilo::SixtyFourHack": 7.300000288523734e-05,
1114
+ "hilo::VerifyAliasing": 9.999999974752427e-07,
1115
+ "hlo-mac-count": 0.005158000160008669,
1116
+ "legalize-ccops-for-tensorizer": 9.999999974752427e-07,
1117
+ "legalize-compare": 3.000000106112566e-06,
1118
+ "lower-argminmax-custom-call": 3.000000106112566e-06,
1119
+ "map-inline": 1.1000000085914508e-05,
1120
+ "metadata-naming": 1.4000000192027073e-05,
1121
+ "mlir::detail::OpToOpPassAdaptor": 2.099999983329326e-05,
1122
+ "mlir::hlo::MhloToPyPenguin": 0.007327999919652939,
1123
+ "mlir::mhlo::LowerComplexExtraPass": 7.899999764049426e-05,
1124
+ "mlir::mhlo::LowerComplexPass": 4.70000013592653e-05,
1125
+ "native-to-custom-softmax": 4.999999873689376e-06,
1126
+ "native-to-custom-softmax-dx": 1.700000029813964e-05,
1127
+ "neuron-hlo-verifier": 0.0003319999959785491,
1128
+ "operand_upcaster": 1.2999999853491317e-05,
1129
+ "post-par-pipe-begin": 9.999999974752427e-07,
1130
+ "post-par-pipe-end": 0.0,
1131
+ "post-partition-simplification": 0.00046999999904073775,
1132
+ "replace-minimum-constant": 7.000000096013537e-06,
1133
+ "reshape-mover": 1.9999999949504854e-06,
1134
+ "simplify-concat": 3.199999991920777e-05,
1135
+ "simplify-while-loops": 1.9999999949504854e-06,
1136
+ "transform-variadic-reduce": 4.5000000682193786e-05,
1137
+ "tuple-simplifier": 3.999999989900971e-06,
1138
+ "unpack-nested-aws-ntwsr": 3.999999989900971e-06,
1139
+ "unroll-while-loop": 0.0
1140
+ },
1141
+ "hilo": {
1142
+ "ArithmeticIntensity": 387.7274169921875,
1143
+ "HloMacCount": 77466042368.0,
1144
+ "Traffic": 399590208.0
1145
+ }
1146
+ },
1147
+ "topk": {
1148
+ "compiletime": {
1149
+ "CoalesceCCOp": 0.014192342758178711,
1150
+ "DMALocalityOpt": 0.00689697265625,
1151
+ "DMAProfiler": 0.01308584213256836,
1152
+ "DataStreaming": 0.022514820098876953,
1153
+ "DoNothing": 0.0002422332763671875,
1154
+ "ExpandISAMacro": 0.01119232177734375,
1155
+ "FactorizeBlkDims": 0.05026698112487793,
1156
+ "InferPSumTensor": 0.032309532165527344,
1157
+ "InferSharedMemLoc": 0.008169412612915039,
1158
+ "InsertCoreBarrier": 0.008690834045410156,
1159
+ "LateLegalizeInst": 0.02731013298034668,
1160
+ "LateNeuronInstComb": 0.029446125030517578,
1161
+ "LegalizeSundaAccess": 0.05955004692077637,
1162
+ "LegalizeType": 0.04967856407165527,
1163
+ "LowerBroadcast": 0.0077512264251708984,
1164
+ "LowerIntrinsics": 0.007628440856933594,
1165
+ "LowerTranspose": 0.015612125396728516,
1166
+ "NeuronInstComb": 0.029858112335205078,
1167
+ "NeuronLICM": 0.02176380157470703,
1168
+ "NeuronSimplifyPredicates": 0.008015632629394531,
1169
+ "NeuronValueNumbering": 0.015556097030639648,
1170
+ "SFKVectorizer": 0.10406112670898438,
1171
+ "SimpleAllReduceTiling": 0.01195383071899414,
1172
+ "SimplifyNeuronTensor": 0.15082645416259766,
1173
+ "SpillPSum": 0.17061901092529297,
1174
+ "WeightCoalescing": 0.009498357772827148
1175
+ }
1176
+ }
1177
+ }
context_encoding_model/_tp0_bk5/graph.neff ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:56e28f3613a7ada8c1d580c4a0d3979da6436bd82072a724c52018668343c286
3
+ size 3062784