Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +27 -0
- context_encoding_model/_tp0_bk0/command.txt +1 -0
- context_encoding_model/_tp0_bk0/compile_flags.MODULE_e7e1b6c43bb87ca73ecc+2ee9f01d.json +1 -0
- context_encoding_model/_tp0_bk0/global_metric_store.json +1147 -0
- context_encoding_model/_tp0_bk0/graph.neff +3 -0
- context_encoding_model/_tp0_bk0/log-neuron-cc.txt +0 -0
- context_encoding_model/_tp0_bk0/metaneff.pb +3 -0
- context_encoding_model/_tp0_bk0/model.MODULE_e7e1b6c43bb87ca73ecc+2ee9f01d.hlo_module.pb +3 -0
- context_encoding_model/_tp0_bk0/model.MODULE_e7e1b6c43bb87ca73ecc+2ee9f01d.neff +3 -0
- context_encoding_model/_tp0_bk0/neuron_config.json +224 -0
- context_encoding_model/_tp0_bk1/command.txt +1 -0
- context_encoding_model/_tp0_bk1/compile_flags.MODULE_2330bfb0632c950ddab1+62ecd68b.json +1 -0
- context_encoding_model/_tp0_bk1/global_metric_store.json +1177 -0
- context_encoding_model/_tp0_bk1/graph.neff +3 -0
- context_encoding_model/_tp0_bk1/log-neuron-cc.txt +0 -0
- context_encoding_model/_tp0_bk1/metaneff.pb +3 -0
- context_encoding_model/_tp0_bk1/model.MODULE_2330bfb0632c950ddab1+62ecd68b.hlo_module.pb +3 -0
- context_encoding_model/_tp0_bk1/model.MODULE_2330bfb0632c950ddab1+62ecd68b.neff +3 -0
- context_encoding_model/_tp0_bk1/neuron_config.json +224 -0
- context_encoding_model/_tp0_bk2/command.txt +1 -0
- context_encoding_model/_tp0_bk2/compile_flags.MODULE_49bb42f69f5b159ae769+3467f95e.json +1 -0
- context_encoding_model/_tp0_bk2/global_metric_store.json +1177 -0
- context_encoding_model/_tp0_bk2/graph.neff +3 -0
- context_encoding_model/_tp0_bk2/log-neuron-cc.txt +0 -0
- context_encoding_model/_tp0_bk2/metaneff.pb +3 -0
- context_encoding_model/_tp0_bk2/model.MODULE_49bb42f69f5b159ae769+3467f95e.hlo_module.pb +3 -0
- context_encoding_model/_tp0_bk2/model.MODULE_49bb42f69f5b159ae769+3467f95e.neff +3 -0
- context_encoding_model/_tp0_bk2/neuron_config.json +224 -0
- context_encoding_model/_tp0_bk3/command.txt +1 -0
- context_encoding_model/_tp0_bk3/compile_flags.MODULE_be035899334776123ed5+d208bdce.json +1 -0
- context_encoding_model/_tp0_bk3/global_metric_store.json +1177 -0
- context_encoding_model/_tp0_bk3/graph.neff +3 -0
- context_encoding_model/_tp0_bk3/log-neuron-cc.txt +0 -0
- context_encoding_model/_tp0_bk3/metaneff.pb +3 -0
- context_encoding_model/_tp0_bk3/model.MODULE_be035899334776123ed5+d208bdce.hlo_module.pb +3 -0
- context_encoding_model/_tp0_bk3/model.MODULE_be035899334776123ed5+d208bdce.neff +3 -0
- context_encoding_model/_tp0_bk3/neuron_config.json +224 -0
- context_encoding_model/_tp0_bk4/command.txt +1 -0
- context_encoding_model/_tp0_bk4/compile_flags.MODULE_95ef7ca73cc0a6161be2+96be3c33.json +1 -0
- context_encoding_model/_tp0_bk4/global_metric_store.json +1177 -0
- context_encoding_model/_tp0_bk4/graph.neff +3 -0
- context_encoding_model/_tp0_bk4/log-neuron-cc.txt +0 -0
- context_encoding_model/_tp0_bk4/metaneff.pb +3 -0
- context_encoding_model/_tp0_bk4/model.MODULE_95ef7ca73cc0a6161be2+96be3c33.hlo_module.pb +3 -0
- context_encoding_model/_tp0_bk4/model.MODULE_95ef7ca73cc0a6161be2+96be3c33.neff +3 -0
- context_encoding_model/_tp0_bk4/neuron_config.json +224 -0
- context_encoding_model/_tp0_bk5/command.txt +1 -0
- context_encoding_model/_tp0_bk5/compile_flags.MODULE_96a8f4e12dc810958634+b1e26cef.json +1 -0
- context_encoding_model/_tp0_bk5/global_metric_store.json +1177 -0
- context_encoding_model/_tp0_bk5/graph.neff +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,30 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
context_encoding_model/_tp0_bk0/graph.neff filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
context_encoding_model/_tp0_bk0/model.MODULE_e7e1b6c43bb87ca73ecc+2ee9f01d.neff filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
context_encoding_model/_tp0_bk1/graph.neff filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
context_encoding_model/_tp0_bk1/model.MODULE_2330bfb0632c950ddab1+62ecd68b.neff filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
context_encoding_model/_tp0_bk2/graph.neff filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
context_encoding_model/_tp0_bk2/model.MODULE_49bb42f69f5b159ae769+3467f95e.neff filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
context_encoding_model/_tp0_bk3/graph.neff filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
context_encoding_model/_tp0_bk3/model.MODULE_be035899334776123ed5+d208bdce.neff filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
context_encoding_model/_tp0_bk4/graph.neff filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
context_encoding_model/_tp0_bk4/model.MODULE_95ef7ca73cc0a6161be2+96be3c33.neff filter=lfs diff=lfs merge=lfs -text
|
| 46 |
+
context_encoding_model/_tp0_bk5/graph.neff filter=lfs diff=lfs merge=lfs -text
|
| 47 |
+
context_encoding_model/_tp0_bk5/model.MODULE_96a8f4e12dc810958634+b1e26cef.neff filter=lfs diff=lfs merge=lfs -text
|
| 48 |
+
layout_opt/graph.neff filter=lfs diff=lfs merge=lfs -text
|
| 49 |
+
layout_opt/model/graph.hlo filter=lfs diff=lfs merge=lfs -text
|
| 50 |
+
token_generation_model/_tp0_bk0/graph.neff filter=lfs diff=lfs merge=lfs -text
|
| 51 |
+
token_generation_model/_tp0_bk0/model.MODULE_caeca0352a0240106f96+d5490f71.neff filter=lfs diff=lfs merge=lfs -text
|
| 52 |
+
token_generation_model/_tp0_bk0/wrapped_neff.hlo filter=lfs diff=lfs merge=lfs -text
|
| 53 |
+
token_generation_model/_tp0_bk1/graph.neff filter=lfs diff=lfs merge=lfs -text
|
| 54 |
+
token_generation_model/_tp0_bk1/model.MODULE_122f32d499d16ac150a0+bdebe6e1.neff filter=lfs diff=lfs merge=lfs -text
|
| 55 |
+
token_generation_model/_tp0_bk2/graph.neff filter=lfs diff=lfs merge=lfs -text
|
| 56 |
+
token_generation_model/_tp0_bk2/model.MODULE_bac42b9b464c64624582+1ea12800.neff filter=lfs diff=lfs merge=lfs -text
|
| 57 |
+
token_generation_model/_tp0_bk3/graph.neff filter=lfs diff=lfs merge=lfs -text
|
| 58 |
+
token_generation_model/_tp0_bk3/model.MODULE_8aa2bc135acfce1f4a61+bd0ab490.neff filter=lfs diff=lfs merge=lfs -text
|
| 59 |
+
token_generation_model/_tp0_bk4/graph.neff filter=lfs diff=lfs merge=lfs -text
|
| 60 |
+
token_generation_model/_tp0_bk4/model.MODULE_ec05e5a8222761962028+3b7d8ecf.neff filter=lfs diff=lfs merge=lfs -text
|
| 61 |
+
token_generation_model/_tp0_bk5/graph.neff filter=lfs diff=lfs merge=lfs -text
|
| 62 |
+
token_generation_model/_tp0_bk5/model.MODULE_b0c5e51af4aeb4ea04b2+a0432539.neff filter=lfs diff=lfs merge=lfs -text
|
context_encoding_model/_tp0_bk0/command.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
neuronx-cc compile --framework=XLA model.MODULE_e7e1b6c43bb87ca73ecc+2ee9f01d.hlo_module.pb --output model.MODULE_e7e1b6c43bb87ca73ecc+2ee9f01d.neff --target=trn2 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ' --lnc=2 -O1 '--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true' --logfile=log-neuron-cc.txt --verbose=35
|
context_encoding_model/_tp0_bk0/compile_flags.MODULE_e7e1b6c43bb87ca73ecc+2ee9f01d.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
["--target=trn2", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "--lnc=2", "-O1", "--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true", "--logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk0/log-neuron-cc.txt"]
|
context_encoding_model/_tp0_bk0/global_metric_store.json
ADDED
|
@@ -0,0 +1,1147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"Average": {
|
| 3 |
+
"tensorizer": {
|
| 4 |
+
"StaticProfiler::AverageFractalPeUtilization": 98.70232391357422,
|
| 5 |
+
"StaticProfiler::AveragePartitionUtilization": 94.02606201171875,
|
| 6 |
+
"StaticProfiler::AveragePeUtilization": 96.57791900634766,
|
| 7 |
+
"StaticProfiler::LocalizationEfficiency": 96.75444030761719,
|
| 8 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 99.23246002197266,
|
| 9 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
|
| 10 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 0.0
|
| 11 |
+
}
|
| 12 |
+
},
|
| 13 |
+
"Count": {
|
| 14 |
+
"tensorizer": {
|
| 15 |
+
"StaticProfiler::AverageFractalPeUtilization": 1.0,
|
| 16 |
+
"StaticProfiler::AveragePartitionUtilization": 1.0,
|
| 17 |
+
"StaticProfiler::AveragePeUtilization": 1.0,
|
| 18 |
+
"StaticProfiler::LocalizationEfficiency": 1.0,
|
| 19 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1.0,
|
| 20 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 1.0,
|
| 21 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 1.0
|
| 22 |
+
}
|
| 23 |
+
},
|
| 24 |
+
"Sum": {
|
| 25 |
+
"compiletime": {
|
| 26 |
+
"AGOrderingAnalysisPass": 0.037471771240234375,
|
| 27 |
+
"AffinePredicateResolution": 0.0048100948333740234,
|
| 28 |
+
"AliasDependencyElimination": 0.0002529621124267578,
|
| 29 |
+
"AliasDependencyInduction": 0.005568504333496094,
|
| 30 |
+
"AliasDependencyReset": 0.11161017417907715,
|
| 31 |
+
"BFComputeCutting": 0.0024290084838867188,
|
| 32 |
+
"BirCodeGenLoop": 0.32352304458618164,
|
| 33 |
+
"CCOpFusion": 0.033486366271972656,
|
| 34 |
+
"CanonicalizeConv": 2.7000001864507794e-05,
|
| 35 |
+
"CanonicalizeDAGForPGTiling": 0.004197120666503906,
|
| 36 |
+
"CanonicalizeForTensorizer": 3.899999865097925e-05,
|
| 37 |
+
"CanonicalizeIR": 0.0025298595428466797,
|
| 38 |
+
"Canonicalizer": 0.00088900001719594,
|
| 39 |
+
"CoalesceCCOp": 0.014135599136352539,
|
| 40 |
+
"CommuteConcat": 0.0018744468688964844,
|
| 41 |
+
"DMALocalityOpt": 0.01189279556274414,
|
| 42 |
+
"DMAProfiler": 0.025990962982177734,
|
| 43 |
+
"DMATilingProfiler": 0.015254497528076172,
|
| 44 |
+
"DataLocalityOpt": 0.1120154857635498,
|
| 45 |
+
"DataStreaming": 0.03728485107421875,
|
| 46 |
+
"DeConcat": 0.0022406578063964844,
|
| 47 |
+
"DeadCodeElimination": 0.0021486282348632813,
|
| 48 |
+
"DeadStoreElimination": 0.0063364505767822266,
|
| 49 |
+
"DelinearIndices": 0.0064697265625,
|
| 50 |
+
"Delinearization": 0.004486560821533203,
|
| 51 |
+
"DelinearizeSPMD": 0.01732611656188965,
|
| 52 |
+
"DoNothing": 0.0007770061492919922,
|
| 53 |
+
"DramToDramTranspose": 0.02082037925720215,
|
| 54 |
+
"DumpGraphAndMetadata": 0.036411285400390625,
|
| 55 |
+
"EliminateDivs": 0.01006174087524414,
|
| 56 |
+
"ExpandBatchNorm": 0.0024886131286621094,
|
| 57 |
+
"ExpandISAMacro": 0.01822209358215332,
|
| 58 |
+
"FactorizeBlkDims": 0.07448649406433105,
|
| 59 |
+
"FactorizeThreadAxesInFreeDims": 0.0071103572845458984,
|
| 60 |
+
"FlattenMacroLoop": 0.009794235229492188,
|
| 61 |
+
"GenericAccessSimplifier": 0.0009224414825439453,
|
| 62 |
+
"HoistCompute": 7.000000096013537e-06,
|
| 63 |
+
"IdentifyCrossPassTensors": 3.600000127335079e-05,
|
| 64 |
+
"InferInitValue": 0.12128233909606934,
|
| 65 |
+
"InferIntrinsicOnCC": 0.01005697250366211,
|
| 66 |
+
"InferNeuronTensor": 0.029047489166259766,
|
| 67 |
+
"InferNonlocalTensors": 0.017493009567260742,
|
| 68 |
+
"InferPSumTensor": 0.09335684776306152,
|
| 69 |
+
"InferShardAxis": 0.26027798652648926,
|
| 70 |
+
"InferSharedMemLoc": 0.016659259796142578,
|
| 71 |
+
"InlineNativeKernels": 0.002816915512084961,
|
| 72 |
+
"InsertCoreBarrier": 0.0162966251373291,
|
| 73 |
+
"InsertIOTransposes": 0.019797325134277344,
|
| 74 |
+
"InsertImplicitShardAxisBeforeISel": 0.05061173439025879,
|
| 75 |
+
"InsertLocalTransposes": 0.004299163818359375,
|
| 76 |
+
"InsertOffloadedTransposes": 0.008011579513549805,
|
| 77 |
+
"LICM": 0.009003639221191406,
|
| 78 |
+
"LateLegalizeInst": 0.035849571228027344,
|
| 79 |
+
"LateLegalizePostSplit": 0.013758182525634766,
|
| 80 |
+
"LateLowerReshapeOp": 0.0012693405151367188,
|
| 81 |
+
"LateLowerTensorOp": 0.002027750015258789,
|
| 82 |
+
"LateNeuronInstComb": 0.14670348167419434,
|
| 83 |
+
"LayoutPreprocessing": 0.025156497955322266,
|
| 84 |
+
"LayoutPreprocessingAndAnalysis": 0.06950831413269043,
|
| 85 |
+
"LayoutRequirementAnalysis": 0.0069408416748046875,
|
| 86 |
+
"LegalizeCCOpLayout": 0.003494739532470703,
|
| 87 |
+
"LegalizeOpLevelAlias": 0.0016810894012451172,
|
| 88 |
+
"LegalizePartitionReduce": 0.0026693344116210938,
|
| 89 |
+
"LegalizeSundaAccess": 0.08684325218200684,
|
| 90 |
+
"LegalizeSundaMacro": 0.10486245155334473,
|
| 91 |
+
"LegalizeType": 0.06927132606506348,
|
| 92 |
+
"LocalLayoutOpt": 0.012215137481689453,
|
| 93 |
+
"LoopFusion": 0.0049479007720947266,
|
| 94 |
+
"LoopSplitting": 0.0008144378662109375,
|
| 95 |
+
"LowerBroadcast": 0.019241809844970703,
|
| 96 |
+
"LowerCCOpBlockAxis": 0.0037145614624023438,
|
| 97 |
+
"LowerComplexBroadcast": 0.0070230960845947266,
|
| 98 |
+
"LowerIntrinsics": 0.0899801254272461,
|
| 99 |
+
"LowerShardAxis": 0.020240068435668945,
|
| 100 |
+
"LowerTensorOp": 0.028459787368774414,
|
| 101 |
+
"LowerToSendRecv": 0.02129983901977539,
|
| 102 |
+
"LowerTranspose": 0.06694269180297852,
|
| 103 |
+
"MacroGeneration": 0.03631877899169922,
|
| 104 |
+
"MaskPropagation": 0.004620075225830078,
|
| 105 |
+
"MemcastMotion": 2.89999989036005e-05,
|
| 106 |
+
"MemcpyElimination": 0.04741477966308594,
|
| 107 |
+
"MutateDataType": 0.002264261245727539,
|
| 108 |
+
"NeuronAliasDependencyInduction": 0.002180337905883789,
|
| 109 |
+
"NeuronAliasDependencyReset": 0.08514618873596191,
|
| 110 |
+
"NeuronInstComb": 0.05580711364746094,
|
| 111 |
+
"NeuronLICM": 0.047100067138671875,
|
| 112 |
+
"NeuronLoopFusion": 0.05364656448364258,
|
| 113 |
+
"NeuronLoopInterchange": 0.002526521682739258,
|
| 114 |
+
"NeuronSimplifier": 0.06896662712097168,
|
| 115 |
+
"NeuronSimplifyPredicates": 0.042169809341430664,
|
| 116 |
+
"NeuronValueNumbering": 0.025714874267578125,
|
| 117 |
+
"OptimizeAliasedCopyChain": 0.0007548332214355469,
|
| 118 |
+
"OptimizeNKIKernels": 4.075549602508545,
|
| 119 |
+
"PAGLayoutOpt": 0.1111152172088623,
|
| 120 |
+
"PComputeCutting": 0.005707263946533203,
|
| 121 |
+
"PGLayoutTilingPipeline": 1.204958438873291,
|
| 122 |
+
"PGTiling": 0.4116194248199463,
|
| 123 |
+
"PadElimination": 0.0003600120544433594,
|
| 124 |
+
"ParAxesAnnotation": 0.050878286361694336,
|
| 125 |
+
"PartialLoopFusion": 0.0372469425201416,
|
| 126 |
+
"PartialSimdFusion": 0.021113157272338867,
|
| 127 |
+
"PenguinizeFunctions": 3.199999991920777e-05,
|
| 128 |
+
"PerfectLoopNest": 0.007718086242675781,
|
| 129 |
+
"PruneFunctions": 3.400000059627928e-05,
|
| 130 |
+
"RecognizeOpIdiom": 0.0058002471923828125,
|
| 131 |
+
"Recompute": 0.0017511844635009766,
|
| 132 |
+
"RelaxPredicates": 0.00795745849609375,
|
| 133 |
+
"Rematerialization": 0.0019276142120361328,
|
| 134 |
+
"RemoveOptimizationBarriers": 8.50000069476664e-05,
|
| 135 |
+
"RemoveShardedPartitionAxes": 0.008410930633544922,
|
| 136 |
+
"ReshapeWeights": 0.0063934326171875,
|
| 137 |
+
"ResolveAccessConflict": 0.01411294937133789,
|
| 138 |
+
"ResolveComplicatePredicates": 0.004876375198364258,
|
| 139 |
+
"RewriteReplicationMatmul": 0.0017600059509277344,
|
| 140 |
+
"RewriteWeights": 0.004542827606201172,
|
| 141 |
+
"SFKVectorizer": 0.3233633041381836,
|
| 142 |
+
"ScatterMotion": 5.7999997807201e-05,
|
| 143 |
+
"ShardingPropagationAnalysis": 0.06259655952453613,
|
| 144 |
+
"SimpleAllReduceTiling": 0.010744571685791016,
|
| 145 |
+
"Simplifier": 0.0033507347106933594,
|
| 146 |
+
"SimplifyMacroPredicates": 0.056143999099731445,
|
| 147 |
+
"SimplifyNeuronTensor": 0.1345655918121338,
|
| 148 |
+
"SimplifySlice": 0.001861572265625,
|
| 149 |
+
"SimplifyTensor": 0.02954578399658203,
|
| 150 |
+
"SpillPSum": 0.11643767356872559,
|
| 151 |
+
"SplitAPUnionSets": 0.07312703132629395,
|
| 152 |
+
"SplitAccGrp": 0.002663135528564453,
|
| 153 |
+
"StaticProfiler": 0.02257680892944336,
|
| 154 |
+
"StaticTransposeLocalTensor": 0.003572225570678711,
|
| 155 |
+
"SundaISel": 0.10315561294555664,
|
| 156 |
+
"TCTransform": 0.0025663375854492188,
|
| 157 |
+
"TensorInitialization": 0.00860285758972168,
|
| 158 |
+
"TensorOpSimplifier": 0.008630037307739258,
|
| 159 |
+
"TensorOpTransform": 0.028581619262695313,
|
| 160 |
+
"TensorizerLegalizationPass": 4.600000102072954e-05,
|
| 161 |
+
"TileCCOps": 0.00518488883972168,
|
| 162 |
+
"TilingProfiler": 0.023342609405517578,
|
| 163 |
+
"TransformConvOp": 0.008756637573242188,
|
| 164 |
+
"TritiumFusion": 0.13446974754333496,
|
| 165 |
+
"ValueNumbering": 0.003237485885620117,
|
| 166 |
+
"VectorizeDMA": 0.028183698654174805,
|
| 167 |
+
"VectorizeMatMult": 0.015199661254882813,
|
| 168 |
+
"VerifySupportedOps": 3.400000059627928e-05,
|
| 169 |
+
"WeightCoalescing": 0.01640915870666504,
|
| 170 |
+
"ZeroSizeTensorElimination": 0.0001671314239501953,
|
| 171 |
+
"algsimp": 0.0017099999822676182,
|
| 172 |
+
"batchnorm_expander": 3.400000059627928e-05,
|
| 173 |
+
"boundary-marker-removal": 1.2000000424450263e-05,
|
| 174 |
+
"call-inliner": 0.0002339999919058755,
|
| 175 |
+
"canonicalize-boundary-marker": 1.4999999621068127e-05,
|
| 176 |
+
"collective-stream-id-checker": 6.299999949987978e-05,
|
| 177 |
+
"comparison-expander": 0.0005050000036135316,
|
| 178 |
+
"computation-deduplicator": 5.100000271340832e-05,
|
| 179 |
+
"config-lowering": 0.0002690000110305846,
|
| 180 |
+
"constant-statistics": 0.000455000001238659,
|
| 181 |
+
"constant_folding": 0.00023099999816622585,
|
| 182 |
+
"cse": 3.7000001611886546e-05,
|
| 183 |
+
"dce": 6.000000212225132e-05,
|
| 184 |
+
"dot_decomposer": 0.0009510000236332417,
|
| 185 |
+
"dynamic-slice-transpose": 1.2999998943996616e-05,
|
| 186 |
+
"eliminate-redundant-compare": 0.00020500000391621143,
|
| 187 |
+
"emit-offloaded-dropout": 8.399999933317304e-05,
|
| 188 |
+
"flatten-call-graph": 0.0006050000083632767,
|
| 189 |
+
"fuse-send-recv": 5.199999577598646e-05,
|
| 190 |
+
"hilo-conditional-to-select": 1.4000000192027073e-05,
|
| 191 |
+
"hilo::LegalizeAlias": 1.2000000424450263e-05,
|
| 192 |
+
"hilo::NeuronInstCombine": 0.0001320000010309741,
|
| 193 |
+
"hilo::NeuronOpFusion": 9.099999442696571e-05,
|
| 194 |
+
"hilo::ReplaceTokenTypeWithU8Pass": 3.300000025774352e-05,
|
| 195 |
+
"hilo::ScheduleFusion": 5.999999757477781e-06,
|
| 196 |
+
"hilo::SixtyFourHack": 5.999999848427251e-05,
|
| 197 |
+
"hilo::VerifyAliasing": 3.999999989900971e-06,
|
| 198 |
+
"hlo-mac-count": 0.012813999317586422,
|
| 199 |
+
"instruction-histogram": 0.0005469999741762877,
|
| 200 |
+
"io-con-pipe-begin": 4.999999873689376e-06,
|
| 201 |
+
"io-con-pipe-end": 9.999999974752427e-07,
|
| 202 |
+
"io-layout-normalization": 0.0009079999872483313,
|
| 203 |
+
"io-statistics": 4.400000034365803e-05,
|
| 204 |
+
"legalize-ccops-for-tensorizer": 3.999999989900971e-06,
|
| 205 |
+
"legalize-compare": 1.1000000085914508e-05,
|
| 206 |
+
"lower-argminmax-custom-call": 9.999999747378752e-06,
|
| 207 |
+
"map-inline": 0.0007319999858736992,
|
| 208 |
+
"metadata-naming": 4.3000000005122274e-05,
|
| 209 |
+
"mlir::detail::OpToOpPassAdaptor": 7.100000220816582e-05,
|
| 210 |
+
"mlir::hlo::MhloToPyPenguin": 0.006075000390410423,
|
| 211 |
+
"mlir::mhlo::LowerComplexExtraPass": 0.0002460000105202198,
|
| 212 |
+
"mlir::mhlo::LowerComplexPass": 0.00047699996503069997,
|
| 213 |
+
"native-to-custom-softmax": 0.0005559999844990671,
|
| 214 |
+
"native-to-custom-softmax-dx": 0.0005599999567493796,
|
| 215 |
+
"neuron-hlo-verifier": 0.010796000249683857,
|
| 216 |
+
"operand_upcaster": 4.199999966658652e-05,
|
| 217 |
+
"opt-barrier-removal": 0.00039500001003034413,
|
| 218 |
+
"post-par-pipe-begin": 4.70000013592653e-05,
|
| 219 |
+
"post-par-pipe-end": 0.0,
|
| 220 |
+
"post-partition-simplification": 0.001361000002361834,
|
| 221 |
+
"pre-par-pipe-begin": 9.999999974752427e-07,
|
| 222 |
+
"pre-par-pipe-end": 0.0,
|
| 223 |
+
"pre-partition-simplification": 0.05799899995326996,
|
| 224 |
+
"replace-minimum-constant": 0.0003459999861661345,
|
| 225 |
+
"reshape-mover": 8.900000102585182e-05,
|
| 226 |
+
"simplify-concat": 0.00010900000052060932,
|
| 227 |
+
"simplify-while-loops": 5.900000178371556e-05,
|
| 228 |
+
"transform-variadic-reduce": 5.699999746866524e-05,
|
| 229 |
+
"tuple-simplifier": 0.00020900000527035445,
|
| 230 |
+
"unpack-nested-aws-ntwsr": 0.00026500000967644155,
|
| 231 |
+
"unroll-while-loop": 9.000000318337698e-06,
|
| 232 |
+
"zero_sized_hlo_elimination": 0.0007340000011026859
|
| 233 |
+
},
|
| 234 |
+
"hilo": {
|
| 235 |
+
"ConstantSize": 238229.0,
|
| 236 |
+
"HloInputCount": 371.0,
|
| 237 |
+
"HloMacCount": 6666190848.0,
|
| 238 |
+
"HloOutputCount": 57.0,
|
| 239 |
+
"IfmapSize": 3910913024.0,
|
| 240 |
+
"OfmapSize": 1879048192.0,
|
| 241 |
+
"OutputsReadFromCount": 0.0,
|
| 242 |
+
"PassthroughTensorsCount": 0.0,
|
| 243 |
+
"RedundantOutputCount": 0.0,
|
| 244 |
+
"Traffic": 864804480.0
|
| 245 |
+
},
|
| 246 |
+
"tensorizer": {
|
| 247 |
+
"DMATilingProfiler::TotalInstructionsAfterTiling": 20773.0,
|
| 248 |
+
"StaticProfiler::AifUb": 131.73849487304688,
|
| 249 |
+
"StaticProfiler::ArithmeticIntensityTensorizer": 127.46285247802734,
|
| 250 |
+
"StaticProfiler::AverageDmaLength": 2400.2490234375,
|
| 251 |
+
"StaticProfiler::DDRTransferBytes": 361746464.0,
|
| 252 |
+
"StaticProfiler::InternalTransferBytes": 320526112.0,
|
| 253 |
+
"StaticProfiler::LoadExpanded": 84060.0,
|
| 254 |
+
"StaticProfiler::StoreExpanded": 1898.0,
|
| 255 |
+
"StaticProfiler::TotalDMAExpanded": 85958.0,
|
| 256 |
+
"StaticProfiler::TotalDynamicInstancesCount": 25131.0,
|
| 257 |
+
"StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 24680.0,
|
| 258 |
+
"StaticProfiler::TotalLNCComm": 0.0,
|
| 259 |
+
"StaticProfiler::TotalLNCCommTransfer": 0.0,
|
| 260 |
+
"TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
|
| 261 |
+
"TilingProfiler::DmaInstructionsAfterTiling": 0.0,
|
| 262 |
+
"TilingProfiler::GenericInstructionsAfterTiling": 4.0,
|
| 263 |
+
"TilingProfiler::MatMultInstructionsAfterTiling": 10368.0,
|
| 264 |
+
"TilingProfiler::NumPfTransposes": 6.0,
|
| 265 |
+
"TilingProfiler::NumPfTransposesForIo": 1.0,
|
| 266 |
+
"TilingProfiler::NumPfTransposesForLocal": 1.0,
|
| 267 |
+
"TilingProfiler::NumPfTransposesForNonlocal": 4.0,
|
| 268 |
+
"TilingProfiler::PfTransposeInstructions": 10147.0,
|
| 269 |
+
"TilingProfiler::PfTransposeInstructionsForIo": 9504.0,
|
| 270 |
+
"TilingProfiler::PfTransposeInstructionsForLocal": 1.0,
|
| 271 |
+
"TilingProfiler::PfTransposeInstructionsForNonlocal": 642.0,
|
| 272 |
+
"TilingProfiler::ReduceInstructionsAfterTiling": 4.0,
|
| 273 |
+
"TilingProfiler::SimdInstructionsAfterTiling": 92.0,
|
| 274 |
+
"TilingProfiler::TotalInstructionsAfterTiling": 0.0,
|
| 275 |
+
"TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
|
| 276 |
+
"TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
|
| 277 |
+
"TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
|
| 278 |
+
"TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
|
| 279 |
+
"TransformConvOp::conv2d_column_packing": 0.0,
|
| 280 |
+
"TransformConvOp::conv2d_column_packing_1": 0.0,
|
| 281 |
+
"TransformConvOp::conv2d_column_packing_io10": 0.0,
|
| 282 |
+
"TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
|
| 283 |
+
}
|
| 284 |
+
},
|
| 285 |
+
"all": {
|
| 286 |
+
"compiletime": {
|
| 287 |
+
"algsimp": 0.001560000004246831,
|
| 288 |
+
"call-inliner": 0.0002099999983329326,
|
| 289 |
+
"collective-stream-id-checker": 5.6000000768108293e-05,
|
| 290 |
+
"comparison-expander": 0.0004900000058114529,
|
| 291 |
+
"constant-statistics": 0.000455000001238659,
|
| 292 |
+
"constant_folding": 0.00020900000527035445,
|
| 293 |
+
"dce": 5.700000110664405e-05,
|
| 294 |
+
"dot_decomposer": 0.0009510000236332417,
|
| 295 |
+
"eliminate-redundant-compare": 0.00019500000053085387,
|
| 296 |
+
"flatten-call-graph": 0.0005799999926239252,
|
| 297 |
+
"hlo-mac-count": 0.00829899962991476,
|
| 298 |
+
"instruction-histogram": 0.0005469999741762877,
|
| 299 |
+
"io-con-pipe-begin": 4.999999873689376e-06,
|
| 300 |
+
"io-con-pipe-end": 9.999999974752427e-07,
|
| 301 |
+
"io-layout-normalization": 0.0009079999872483313,
|
| 302 |
+
"io-statistics": 4.400000034365803e-05,
|
| 303 |
+
"map-inline": 0.0007019999902695417,
|
| 304 |
+
"native-to-custom-softmax": 0.0005370000144466758,
|
| 305 |
+
"native-to-custom-softmax-dx": 0.00047599998652003706,
|
| 306 |
+
"neuron-hlo-verifier": 0.009705999866127968,
|
| 307 |
+
"opt-barrier-removal": 0.00039500001003034413,
|
| 308 |
+
"pre-par-pipe-begin": 9.999999974752427e-07,
|
| 309 |
+
"pre-par-pipe-end": 0.0,
|
| 310 |
+
"pre-partition-simplification": 0.05799899995326996,
|
| 311 |
+
"replace-minimum-constant": 0.0003279999946244061,
|
| 312 |
+
"reshape-mover": 7.999999797903001e-05,
|
| 313 |
+
"simplify-while-loops": 5.2999999752501026e-05,
|
| 314 |
+
"tuple-simplifier": 0.00019700000120792538,
|
| 315 |
+
"unpack-nested-aws-ntwsr": 0.00025400001322850585,
|
| 316 |
+
"unroll-while-loop": 9.000000318337698e-06,
|
| 317 |
+
"zero_sized_hlo_elimination": 0.0007340000011026859
|
| 318 |
+
}
|
| 319 |
+
},
|
| 320 |
+
"cumsum": {
|
| 321 |
+
"compiletime": {
|
| 322 |
+
"CoalesceCCOp": 0.00032806396484375,
|
| 323 |
+
"DMALocalityOpt": 0.00027751922607421875,
|
| 324 |
+
"DMAProfiler": 0.0011353492736816406,
|
| 325 |
+
"DataStreaming": 0.00044035911560058594,
|
| 326 |
+
"DoNothing": 0.0001888275146484375,
|
| 327 |
+
"ExpandISAMacro": 0.003916263580322266,
|
| 328 |
+
"FactorizeBlkDims": 0.001834869384765625,
|
| 329 |
+
"InferPSumTensor": 0.0010616779327392578,
|
| 330 |
+
"InferSharedMemLoc": 0.00044918060302734375,
|
| 331 |
+
"InsertCoreBarrier": 0.0004329681396484375,
|
| 332 |
+
"LateLegalizeInst": 0.002650022506713867,
|
| 333 |
+
"LateNeuronInstComb": 0.002856016159057617,
|
| 334 |
+
"LegalizeSundaAccess": 0.002493619918823242,
|
| 335 |
+
"LegalizeType": 0.0004024505615234375,
|
| 336 |
+
"LowerBroadcast": 0.00041794776916503906,
|
| 337 |
+
"LowerIntrinsics": 0.0003495216369628906,
|
| 338 |
+
"LowerTranspose": 0.00037598609924316406,
|
| 339 |
+
"NeuronInstComb": 0.0011763572692871094,
|
| 340 |
+
"NeuronLICM": 0.0014426708221435547,
|
| 341 |
+
"NeuronSimplifyPredicates": 0.012172937393188477,
|
| 342 |
+
"NeuronValueNumbering": 0.0006816387176513672,
|
| 343 |
+
"SFKVectorizer": 0.011650562286376953,
|
| 344 |
+
"SimpleAllReduceTiling": 0.00033855438232421875,
|
| 345 |
+
"SimplifyNeuronTensor": 0.0009646415710449219,
|
| 346 |
+
"SpillPSum": 0.0025339126586914063,
|
| 347 |
+
"WeightCoalescing": 0.0003387928009033203
|
| 348 |
+
}
|
| 349 |
+
},
|
| 350 |
+
"sg00": {
|
| 351 |
+
"compiletime": {
|
| 352 |
+
"CanonicalizeConv": 7.000000096013537e-06,
|
| 353 |
+
"CanonicalizeForTensorizer": 1.5999999959603883e-05,
|
| 354 |
+
"Canonicalizer": 0.00033000000985339284,
|
| 355 |
+
"HoistCompute": 1.9999999949504854e-06,
|
| 356 |
+
"IdentifyCrossPassTensors": 1.4000000192027073e-05,
|
| 357 |
+
"MemcastMotion": 9.999999747378752e-06,
|
| 358 |
+
"PenguinizeFunctions": 1.4999999621068127e-05,
|
| 359 |
+
"PruneFunctions": 1.4999999621068127e-05,
|
| 360 |
+
"RemoveOptimizationBarriers": 3.300000025774352e-05,
|
| 361 |
+
"ScatterMotion": 2.2000000171829015e-05,
|
| 362 |
+
"TensorizerLegalizationPass": 2.8000000384054147e-05,
|
| 363 |
+
"VerifySupportedOps": 1.2000000424450263e-05,
|
| 364 |
+
"algsimp": 5.199999941396527e-05,
|
| 365 |
+
"batchnorm_expander": 1.1000000085914508e-05,
|
| 366 |
+
"boundary-marker-removal": 3.999999989900971e-06,
|
| 367 |
+
"call-inliner": 7.000000096013537e-06,
|
| 368 |
+
"canonicalize-boundary-marker": 4.999999873689376e-06,
|
| 369 |
+
"collective-stream-id-checker": 1.9999999949504854e-06,
|
| 370 |
+
"comparison-expander": 4.999999873689376e-06,
|
| 371 |
+
"computation-deduplicator": 1.5999999959603883e-05,
|
| 372 |
+
"config-lowering": 0.0001289999927394092,
|
| 373 |
+
"constant_folding": 7.000000096013537e-06,
|
| 374 |
+
"cse": 1.2000000424450263e-05,
|
| 375 |
+
"dce": 9.999999974752427e-07,
|
| 376 |
+
"dynamic-slice-transpose": 4.999999873689376e-06,
|
| 377 |
+
"eliminate-redundant-compare": 3.000000106112566e-06,
|
| 378 |
+
"emit-offloaded-dropout": 4.5000000682193786e-05,
|
| 379 |
+
"flatten-call-graph": 7.999999979801942e-06,
|
| 380 |
+
"fuse-send-recv": 1.8000000636675395e-05,
|
| 381 |
+
"hilo-conditional-to-select": 3.999999989900971e-06,
|
| 382 |
+
"hilo::LegalizeAlias": 6.000000212225132e-06,
|
| 383 |
+
"hilo::NeuronInstCombine": 6.70000008540228e-05,
|
| 384 |
+
"hilo::NeuronOpFusion": 4.099999932805076e-05,
|
| 385 |
+
"hilo::ReplaceTokenTypeWithU8Pass": 1.2999999853491317e-05,
|
| 386 |
+
"hilo::ScheduleFusion": 9.999999974752427e-07,
|
| 387 |
+
"hilo::SixtyFourHack": 1.2000000424450263e-05,
|
| 388 |
+
"hilo::VerifyAliasing": 1.9999999949504854e-06,
|
| 389 |
+
"hlo-mac-count": 1.8999999156221747e-05,
|
| 390 |
+
"legalize-ccops-for-tensorizer": 1.9999999949504854e-06,
|
| 391 |
+
"legalize-compare": 3.999999989900971e-06,
|
| 392 |
+
"lower-argminmax-custom-call": 3.000000106112566e-06,
|
| 393 |
+
"map-inline": 9.000000318337698e-06,
|
| 394 |
+
"metadata-naming": 1.2999999853491317e-05,
|
| 395 |
+
"mlir::detail::OpToOpPassAdaptor": 2.099999983329326e-05,
|
| 396 |
+
"mlir::hlo::MhloToPyPenguin": 0.0009730000165291131,
|
| 397 |
+
"mlir::mhlo::LowerComplexExtraPass": 8.399999933317304e-05,
|
| 398 |
+
"mlir::mhlo::LowerComplexPass": 0.000195999993593432,
|
| 399 |
+
"native-to-custom-softmax": 9.000000318337698e-06,
|
| 400 |
+
"native-to-custom-softmax-dx": 5.500000042957254e-05,
|
| 401 |
+
"neuron-hlo-verifier": 0.0003929999948013574,
|
| 402 |
+
"operand_upcaster": 1.700000029813964e-05,
|
| 403 |
+
"post-par-pipe-begin": 4.400000034365803e-05,
|
| 404 |
+
"post-par-pipe-end": 0.0,
|
| 405 |
+
"post-partition-simplification": 0.00047500000800937414,
|
| 406 |
+
"replace-minimum-constant": 6.000000212225132e-06,
|
| 407 |
+
"reshape-mover": 3.000000106112566e-06,
|
| 408 |
+
"simplify-concat": 3.400000059627928e-05,
|
| 409 |
+
"simplify-while-loops": 1.9999999949504854e-06,
|
| 410 |
+
"transform-variadic-reduce": 7.999999979801942e-06,
|
| 411 |
+
"tuple-simplifier": 3.999999989900971e-06,
|
| 412 |
+
"unpack-nested-aws-ntwsr": 3.999999989900971e-06,
|
| 413 |
+
"unroll-while-loop": 0.0
|
| 414 |
+
},
|
| 415 |
+
"hilo": {
|
| 416 |
+
"ArithmeticIntensity": 4.265669345855713,
|
| 417 |
+
"ConstantSize": 238229.0,
|
| 418 |
+
"HloInputCount": 371.0,
|
| 419 |
+
"HloMacCount": 838860800.0,
|
| 420 |
+
"HloOutputCount": 57.0,
|
| 421 |
+
"IfmapSize": 3910913024.0,
|
| 422 |
+
"OfmapSize": 1879048192.0,
|
| 423 |
+
"OutputsReadFromCount": 0.0,
|
| 424 |
+
"PassthroughTensorsCount": 0.0,
|
| 425 |
+
"RedundantOutputCount": 0.0,
|
| 426 |
+
"Traffic": 393307936.0
|
| 427 |
+
}
|
| 428 |
+
},
|
| 429 |
+
"sg0000": {
|
| 430 |
+
"compiletime": {
|
| 431 |
+
"AGOrderingAnalysisPass": 0.04803347587585449,
|
| 432 |
+
"AffinePredicateResolution": 0.0014185905456542969,
|
| 433 |
+
"AliasDependencyElimination": 0.0002288818359375,
|
| 434 |
+
"AliasDependencyInduction": 0.023572683334350586,
|
| 435 |
+
"AliasDependencyReset": 0.050307273864746094,
|
| 436 |
+
"BFComputeCutting": 0.0020284652709960938,
|
| 437 |
+
"BirCodeGenLoop": 0.06627583503723145,
|
| 438 |
+
"CCOpFusion": 0.030767440795898438,
|
| 439 |
+
"CanonicalizeDAGForPGTiling": 0.005156278610229492,
|
| 440 |
+
"CanonicalizeIR": 0.0024123191833496094,
|
| 441 |
+
"CoalesceCCOp": 0.017067909240722656,
|
| 442 |
+
"CommuteConcat": 0.0011420249938964844,
|
| 443 |
+
"DMALocalityOpt": 0.0021338462829589844,
|
| 444 |
+
"DMAProfiler": 0.015033483505249023,
|
| 445 |
+
"DMATilingProfiler": 0.006984710693359375,
|
| 446 |
+
"DataLocalityOpt": 0.3054013252258301,
|
| 447 |
+
"DataStreaming": 0.014647245407104492,
|
| 448 |
+
"DeConcat": 0.005982398986816406,
|
| 449 |
+
"DeadCodeElimination": 0.0018534660339355469,
|
| 450 |
+
"DeadStoreElimination": 0.04532670974731445,
|
| 451 |
+
"DelinearIndices": 0.028018474578857422,
|
| 452 |
+
"Delinearization": 0.0051403045654296875,
|
| 453 |
+
"DelinearizeSPMD": 0.03557705879211426,
|
| 454 |
+
"DoNothing": 0.00012373924255371094,
|
| 455 |
+
"DramToDramTranspose": 0.030788660049438477,
|
| 456 |
+
"DumpGraphAndMetadata": 0.008297920227050781,
|
| 457 |
+
"EliminateDivs": 0.003348112106323242,
|
| 458 |
+
"ExpandBatchNorm": 0.002971172332763672,
|
| 459 |
+
"ExpandISAMacro": 0.007505178451538086,
|
| 460 |
+
"FactorizeBlkDims": 0.052065372467041016,
|
| 461 |
+
"FactorizeThreadAxesInFreeDims": 0.006781101226806641,
|
| 462 |
+
"FlattenMacroLoop": 0.006749868392944336,
|
| 463 |
+
"GenericAccessSimplifier": 0.0015370845794677734,
|
| 464 |
+
"InferInitValue": 0.13031220436096191,
|
| 465 |
+
"InferIntrinsicOnCC": 0.01256871223449707,
|
| 466 |
+
"InferNeuronTensor": 0.07101988792419434,
|
| 467 |
+
"InferNonlocalTensors": 0.0933828353881836,
|
| 468 |
+
"InferPSumTensor": 0.09560966491699219,
|
| 469 |
+
"InferShardAxis": 0.312000036239624,
|
| 470 |
+
"InferSharedMemLoc": 0.006642341613769531,
|
| 471 |
+
"InlineNativeKernels": 0.0033979415893554688,
|
| 472 |
+
"InsertCoreBarrier": 0.008008241653442383,
|
| 473 |
+
"InsertIOTransposes": 0.018876314163208008,
|
| 474 |
+
"InsertImplicitShardAxisBeforeISel": 0.016681194305419922,
|
| 475 |
+
"InsertLocalTransposes": 0.009229898452758789,
|
| 476 |
+
"InsertOffloadedTransposes": 0.05370330810546875,
|
| 477 |
+
"LICM": 0.007573604583740234,
|
| 478 |
+
"LateLegalizeInst": 0.01623988151550293,
|
| 479 |
+
"LateLegalizePostSplit": 0.007147073745727539,
|
| 480 |
+
"LateLowerReshapeOp": 0.0011415481567382813,
|
| 481 |
+
"LateLowerTensorOp": 0.0066013336181640625,
|
| 482 |
+
"LateNeuronInstComb": 0.12343692779541016,
|
| 483 |
+
"LayoutPreprocessing": 0.02958393096923828,
|
| 484 |
+
"LayoutPreprocessingAndAnalysis": 0.14548635482788086,
|
| 485 |
+
"LayoutRequirementAnalysis": 0.007357358932495117,
|
| 486 |
+
"LegalizeCCOpLayout": 0.0018928050994873047,
|
| 487 |
+
"LegalizeOpLevelAlias": 0.001081228256225586,
|
| 488 |
+
"LegalizePartitionReduce": 0.003218412399291992,
|
| 489 |
+
"LegalizeSundaAccess": 0.08743572235107422,
|
| 490 |
+
"LegalizeSundaMacro": 0.04705023765563965,
|
| 491 |
+
"LegalizeType": 0.009063720703125,
|
| 492 |
+
"LocalLayoutOpt": 0.017424583435058594,
|
| 493 |
+
"LoopFusion": 0.006888866424560547,
|
| 494 |
+
"LoopSplitting": 0.0018482208251953125,
|
| 495 |
+
"LowerBroadcast": 0.00490117073059082,
|
| 496 |
+
"LowerCCOpBlockAxis": 0.004808902740478516,
|
| 497 |
+
"LowerComplexBroadcast": 0.007742166519165039,
|
| 498 |
+
"LowerIntrinsics": 0.04466986656188965,
|
| 499 |
+
"LowerShardAxis": 0.008558988571166992,
|
| 500 |
+
"LowerTensorOp": 0.011698722839355469,
|
| 501 |
+
"LowerToSendRecv": 0.01171255111694336,
|
| 502 |
+
"LowerTranspose": 0.012961864471435547,
|
| 503 |
+
"MacroGeneration": 0.07335543632507324,
|
| 504 |
+
"MaskPropagation": 0.004875659942626953,
|
| 505 |
+
"MemcpyElimination": 0.19086575508117676,
|
| 506 |
+
"MutateDataType": 0.002115011215209961,
|
| 507 |
+
"NeuronAliasDependencyInduction": 0.0007119178771972656,
|
| 508 |
+
"NeuronAliasDependencyReset": 0.0555264949798584,
|
| 509 |
+
"NeuronInstComb": 0.03685903549194336,
|
| 510 |
+
"NeuronLICM": 0.02129840850830078,
|
| 511 |
+
"NeuronLoopFusion": 0.04936552047729492,
|
| 512 |
+
"NeuronLoopInterchange": 0.008442163467407227,
|
| 513 |
+
"NeuronSimplifier": 0.020423412322998047,
|
| 514 |
+
"NeuronSimplifyPredicates": 0.013469934463500977,
|
| 515 |
+
"NeuronValueNumbering": 0.011552095413208008,
|
| 516 |
+
"OptimizeAliasedCopyChain": 0.0006189346313476563,
|
| 517 |
+
"OptimizeNKIKernels": 0.0030050277709960938,
|
| 518 |
+
"PAGLayoutOpt": 0.4311056137084961,
|
| 519 |
+
"PComputeCutting": 0.008741617202758789,
|
| 520 |
+
"PGLayoutTilingPipeline": 1.7890496253967285,
|
| 521 |
+
"PGTiling": 0.33126235008239746,
|
| 522 |
+
"PadElimination": 0.0006849765777587891,
|
| 523 |
+
"ParAxesAnnotation": 0.3421931266784668,
|
| 524 |
+
"PartialLoopFusion": 0.05652737617492676,
|
| 525 |
+
"PartialSimdFusion": 0.04400372505187988,
|
| 526 |
+
"PerfectLoopNest": 0.007196664810180664,
|
| 527 |
+
"RecognizeOpIdiom": 0.003924369812011719,
|
| 528 |
+
"Recompute": 0.0004436969757080078,
|
| 529 |
+
"RelaxPredicates": 0.006342649459838867,
|
| 530 |
+
"Rematerialization": 0.006484508514404297,
|
| 531 |
+
"RemoveShardedPartitionAxes": 0.03604388236999512,
|
| 532 |
+
"ReshapeWeights": 0.002611398696899414,
|
| 533 |
+
"ResolveAccessConflict": 0.01564621925354004,
|
| 534 |
+
"ResolveComplicatePredicates": 0.0013320446014404297,
|
| 535 |
+
"RewriteReplicationMatmul": 0.008888483047485352,
|
| 536 |
+
"RewriteWeights": 0.005518674850463867,
|
| 537 |
+
"SFKVectorizer": 0.23942208290100098,
|
| 538 |
+
"ShardingPropagationAnalysis": 0.06231117248535156,
|
| 539 |
+
"SimpleAllReduceTiling": 0.008965253829956055,
|
| 540 |
+
"Simplifier": 0.009177446365356445,
|
| 541 |
+
"SimplifyMacroPredicates": 0.03521132469177246,
|
| 542 |
+
"SimplifyNeuronTensor": 0.022907257080078125,
|
| 543 |
+
"SimplifySlice": 0.001043081283569336,
|
| 544 |
+
"SimplifyTensor": 0.028610706329345703,
|
| 545 |
+
"SpillPSum": 0.041993141174316406,
|
| 546 |
+
"SplitAPUnionSets": 0.06584334373474121,
|
| 547 |
+
"SplitAccGrp": 0.005825042724609375,
|
| 548 |
+
"StaticProfiler": 0.013434648513793945,
|
| 549 |
+
"StaticTransposeLocalTensor": 0.008102178573608398,
|
| 550 |
+
"SundaISel": 0.12313151359558105,
|
| 551 |
+
"TCTransform": 0.0010597705841064453,
|
| 552 |
+
"TensorInitialization": 0.024387359619140625,
|
| 553 |
+
"TensorOpSimplifier": 0.006582498550415039,
|
| 554 |
+
"TensorOpTransform": 0.06252408027648926,
|
| 555 |
+
"TileCCOps": 0.016498565673828125,
|
| 556 |
+
"TilingProfiler": 0.06818985939025879,
|
| 557 |
+
"TransformConvOp": 0.0028336048126220703,
|
| 558 |
+
"TritiumFusion": 0.01378488540649414,
|
| 559 |
+
"ValueNumbering": 0.0024378299713134766,
|
| 560 |
+
"VectorizeDMA": 0.042115211486816406,
|
| 561 |
+
"VectorizeMatMult": 0.008977413177490234,
|
| 562 |
+
"WeightCoalescing": 0.005861759185791016,
|
| 563 |
+
"ZeroSizeTensorElimination": 0.00017881393432617188
|
| 564 |
+
},
|
| 565 |
+
"tensorizer": {
|
| 566 |
+
"DMATilingProfiler::TotalInstructionsAfterTiling": 416.0,
|
| 567 |
+
"StaticProfiler::AifUb": 5.140732288360596,
|
| 568 |
+
"StaticProfiler::ArithmeticIntensityTensorizer": 143.96510314941406,
|
| 569 |
+
"StaticProfiler::AverageDmaLength": 2013.53125,
|
| 570 |
+
"StaticProfiler::AverageFractalPeUtilization": 99.74824523925781,
|
| 571 |
+
"StaticProfiler::AveragePartitionUtilization": 99.1868667602539,
|
| 572 |
+
"StaticProfiler::AveragePeUtilization": 99.49378204345703,
|
| 573 |
+
"StaticProfiler::DDRTransferBytes": 16395014.0,
|
| 574 |
+
"StaticProfiler::InternalTransferBytes": 10682368.0,
|
| 575 |
+
"StaticProfiler::LoadExpanded": 3459.0,
|
| 576 |
+
"StaticProfiler::LocalizationEfficiency": 2800.478271484375,
|
| 577 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 3271.21728515625,
|
| 578 |
+
"StaticProfiler::StoreExpanded": 1537.0,
|
| 579 |
+
"StaticProfiler::TotalDMAExpanded": 4996.0,
|
| 580 |
+
"StaticProfiler::TotalDynamicInstancesCount": 801.0,
|
| 581 |
+
"StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 800.0,
|
| 582 |
+
"StaticProfiler::TotalLNCComm": 0.0,
|
| 583 |
+
"StaticProfiler::TotalLNCCommTransfer": 0.0,
|
| 584 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
|
| 585 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
|
| 586 |
+
"TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
|
| 587 |
+
"TilingProfiler::DmaInstructionsAfterTiling": 0.0,
|
| 588 |
+
"TilingProfiler::GenericInstructionsAfterTiling": 10.0,
|
| 589 |
+
"TilingProfiler::MatMultInstructionsAfterTiling": 253.0,
|
| 590 |
+
"TilingProfiler::NumPfTransposes": 7.0,
|
| 591 |
+
"TilingProfiler::NumPfTransposesForIo": 0.0,
|
| 592 |
+
"TilingProfiler::NumPfTransposesForLocal": 5.0,
|
| 593 |
+
"TilingProfiler::NumPfTransposesForNonlocal": 2.0,
|
| 594 |
+
"TilingProfiler::PfTransposeInstructions": 56.0,
|
| 595 |
+
"TilingProfiler::PfTransposeInstructionsForIo": 0.0,
|
| 596 |
+
"TilingProfiler::PfTransposeInstructionsForLocal": 32.0,
|
| 597 |
+
"TilingProfiler::PfTransposeInstructionsForNonlocal": 24.0,
|
| 598 |
+
"TilingProfiler::ReduceInstructionsAfterTiling": 0.0,
|
| 599 |
+
"TilingProfiler::SimdInstructionsAfterTiling": 47.0,
|
| 600 |
+
"TilingProfiler::TotalInstructionsAfterTiling": 0.0,
|
| 601 |
+
"TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
|
| 602 |
+
"TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
|
| 603 |
+
"TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
|
| 604 |
+
"TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
|
| 605 |
+
"TransformConvOp::conv2d_column_packing": 0.0,
|
| 606 |
+
"TransformConvOp::conv2d_column_packing_1": 0.0,
|
| 607 |
+
"TransformConvOp::conv2d_column_packing_io10": 0.0,
|
| 608 |
+
"TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
|
| 609 |
+
}
|
| 610 |
+
},
|
| 611 |
+
"sg0001": {
|
| 612 |
+
"compiletime": {
|
| 613 |
+
"AGOrderingAnalysisPass": 0.19573044776916504,
|
| 614 |
+
"AffinePredicateResolution": 0.0011768341064453125,
|
| 615 |
+
"AliasDependencyElimination": 0.00014972686767578125,
|
| 616 |
+
"AliasDependencyInduction": 0.02374124526977539,
|
| 617 |
+
"AliasDependencyReset": 0.05898928642272949,
|
| 618 |
+
"BFComputeCutting": 0.0019648075103759766,
|
| 619 |
+
"BirCodeGenLoop": 0.04745078086853027,
|
| 620 |
+
"CCOpFusion": 0.034403324127197266,
|
| 621 |
+
"CanonicalizeDAGForPGTiling": 0.013227224349975586,
|
| 622 |
+
"CanonicalizeIR": 0.0016665458679199219,
|
| 623 |
+
"CoalesceCCOp": 0.008426904678344727,
|
| 624 |
+
"CommuteConcat": 0.0011937618255615234,
|
| 625 |
+
"DMALocalityOpt": 0.0020418167114257813,
|
| 626 |
+
"DMAProfiler": 0.0212709903717041,
|
| 627 |
+
"DMATilingProfiler": 0.007970333099365234,
|
| 628 |
+
"DataLocalityOpt": 0.31763386726379395,
|
| 629 |
+
"DataStreaming": 0.013140678405761719,
|
| 630 |
+
"DeConcat": 0.006093025207519531,
|
| 631 |
+
"DeadCodeElimination": 0.0022492408752441406,
|
| 632 |
+
"DeadStoreElimination": 0.03447914123535156,
|
| 633 |
+
"DelinearIndices": 0.017621278762817383,
|
| 634 |
+
"Delinearization": 0.006613731384277344,
|
| 635 |
+
"DelinearizeSPMD": 0.036255598068237305,
|
| 636 |
+
"DoNothing": 9.298324584960938e-05,
|
| 637 |
+
"DramToDramTranspose": 0.011357545852661133,
|
| 638 |
+
"DumpGraphAndMetadata": 0.0038836002349853516,
|
| 639 |
+
"EliminateDivs": 0.007913589477539063,
|
| 640 |
+
"ExpandBatchNorm": 0.0027163028717041016,
|
| 641 |
+
"ExpandISAMacro": 0.006444692611694336,
|
| 642 |
+
"FactorizeBlkDims": 0.023404359817504883,
|
| 643 |
+
"FactorizeThreadAxesInFreeDims": 0.011568069458007813,
|
| 644 |
+
"FlattenMacroLoop": 0.012357473373413086,
|
| 645 |
+
"GenericAccessSimplifier": 0.0020608901977539063,
|
| 646 |
+
"InferInitValue": 0.10583114624023438,
|
| 647 |
+
"InferIntrinsicOnCC": 0.00994729995727539,
|
| 648 |
+
"InferNeuronTensor": 0.04976606369018555,
|
| 649 |
+
"InferNonlocalTensors": 0.04819130897521973,
|
| 650 |
+
"InferPSumTensor": 0.0679934024810791,
|
| 651 |
+
"InferShardAxis": 0.6268763542175293,
|
| 652 |
+
"InferSharedMemLoc": 0.005129814147949219,
|
| 653 |
+
"InlineNativeKernels": 0.009308338165283203,
|
| 654 |
+
"InsertCoreBarrier": 0.00969243049621582,
|
| 655 |
+
"InsertIOTransposes": 0.03561210632324219,
|
| 656 |
+
"InsertImplicitShardAxisBeforeISel": 0.017783164978027344,
|
| 657 |
+
"InsertLocalTransposes": 0.012435436248779297,
|
| 658 |
+
"InsertOffloadedTransposes": 0.008218526840209961,
|
| 659 |
+
"LICM": 0.011756420135498047,
|
| 660 |
+
"LateLegalizeInst": 0.012684106826782227,
|
| 661 |
+
"LateLegalizePostSplit": 0.0054225921630859375,
|
| 662 |
+
"LateLowerReshapeOp": 0.002172231674194336,
|
| 663 |
+
"LateLowerTensorOp": 0.003939151763916016,
|
| 664 |
+
"LateNeuronInstComb": 0.07796549797058105,
|
| 665 |
+
"LayoutPreprocessing": 0.09417939186096191,
|
| 666 |
+
"LayoutPreprocessingAndAnalysis": 0.15397191047668457,
|
| 667 |
+
"LayoutRequirementAnalysis": 0.03167152404785156,
|
| 668 |
+
"LegalizeCCOpLayout": 0.001916646957397461,
|
| 669 |
+
"LegalizeOpLevelAlias": 0.00103759765625,
|
| 670 |
+
"LegalizePartitionReduce": 0.002568960189819336,
|
| 671 |
+
"LegalizeSundaAccess": 0.03490424156188965,
|
| 672 |
+
"LegalizeSundaMacro": 0.04486250877380371,
|
| 673 |
+
"LegalizeType": 0.010438203811645508,
|
| 674 |
+
"LocalLayoutOpt": 0.037950992584228516,
|
| 675 |
+
"LoopFusion": 0.00687098503112793,
|
| 676 |
+
"LoopSplitting": 0.002494335174560547,
|
| 677 |
+
"LowerBroadcast": 0.0028448104858398438,
|
| 678 |
+
"LowerCCOpBlockAxis": 0.016790151596069336,
|
| 679 |
+
"LowerComplexBroadcast": 0.003789663314819336,
|
| 680 |
+
"LowerIntrinsics": 0.06158947944641113,
|
| 681 |
+
"LowerShardAxis": 0.009115934371948242,
|
| 682 |
+
"LowerTensorOp": 0.011396646499633789,
|
| 683 |
+
"LowerToSendRecv": 0.00603795051574707,
|
| 684 |
+
"LowerTranspose": 0.030293703079223633,
|
| 685 |
+
"MacroGeneration": 0.14122748374938965,
|
| 686 |
+
"MaskPropagation": 0.007950544357299805,
|
| 687 |
+
"MemcpyElimination": 0.18889641761779785,
|
| 688 |
+
"MutateDataType": 0.0014033317565917969,
|
| 689 |
+
"NeuronAliasDependencyInduction": 0.0007326602935791016,
|
| 690 |
+
"NeuronAliasDependencyReset": 0.025636672973632813,
|
| 691 |
+
"NeuronInstComb": 0.0452880859375,
|
| 692 |
+
"NeuronLICM": 0.027920246124267578,
|
| 693 |
+
"NeuronLoopFusion": 0.07481861114501953,
|
| 694 |
+
"NeuronLoopInterchange": 0.004810810089111328,
|
| 695 |
+
"NeuronSimplifier": 0.027257442474365234,
|
| 696 |
+
"NeuronSimplifyPredicates": 0.011795282363891602,
|
| 697 |
+
"NeuronValueNumbering": 0.013232946395874023,
|
| 698 |
+
"OptimizeAliasedCopyChain": 0.000640869140625,
|
| 699 |
+
"OptimizeNKIKernels": 0.007096529006958008,
|
| 700 |
+
"PAGLayoutOpt": 0.25133657455444336,
|
| 701 |
+
"PComputeCutting": 0.02008199691772461,
|
| 702 |
+
"PGLayoutTilingPipeline": 2.1073567867279053,
|
| 703 |
+
"PGTiling": 0.5283112525939941,
|
| 704 |
+
"PadElimination": 0.0005664825439453125,
|
| 705 |
+
"ParAxesAnnotation": 0.16274571418762207,
|
| 706 |
+
"PartialLoopFusion": 0.07154703140258789,
|
| 707 |
+
"PartialSimdFusion": 0.05425691604614258,
|
| 708 |
+
"PerfectLoopNest": 0.007505655288696289,
|
| 709 |
+
"RecognizeOpIdiom": 0.004193305969238281,
|
| 710 |
+
"Recompute": 0.0005002021789550781,
|
| 711 |
+
"RelaxPredicates": 0.0031478404998779297,
|
| 712 |
+
"Rematerialization": 0.002758502960205078,
|
| 713 |
+
"RemoveShardedPartitionAxes": 0.05587267875671387,
|
| 714 |
+
"ReshapeWeights": 0.0015969276428222656,
|
| 715 |
+
"ResolveAccessConflict": 0.021365642547607422,
|
| 716 |
+
"ResolveComplicatePredicates": 0.0011401176452636719,
|
| 717 |
+
"RewriteReplicationMatmul": 0.0025501251220703125,
|
| 718 |
+
"RewriteWeights": 0.014093399047851563,
|
| 719 |
+
"SFKVectorizer": 0.51774001121521,
|
| 720 |
+
"ShardingPropagationAnalysis": 0.030755996704101563,
|
| 721 |
+
"SimpleAllReduceTiling": 0.003780364990234375,
|
| 722 |
+
"Simplifier": 0.006270885467529297,
|
| 723 |
+
"SimplifyMacroPredicates": 0.01894402503967285,
|
| 724 |
+
"SimplifyNeuronTensor": 0.036655426025390625,
|
| 725 |
+
"SimplifySlice": 0.0019352436065673828,
|
| 726 |
+
"SimplifyTensor": 0.033560752868652344,
|
| 727 |
+
"SpillPSum": 0.03554582595825195,
|
| 728 |
+
"SplitAPUnionSets": 0.039057016372680664,
|
| 729 |
+
"SplitAccGrp": 0.002908468246459961,
|
| 730 |
+
"StaticProfiler": 0.009857654571533203,
|
| 731 |
+
"StaticTransposeLocalTensor": 0.014261007308959961,
|
| 732 |
+
"SundaISel": 0.07885026931762695,
|
| 733 |
+
"TCTransform": 0.0012857913970947266,
|
| 734 |
+
"TensorInitialization": 0.011929512023925781,
|
| 735 |
+
"TensorOpSimplifier": 0.007134199142456055,
|
| 736 |
+
"TensorOpTransform": 0.05220603942871094,
|
| 737 |
+
"TileCCOps": 0.006574392318725586,
|
| 738 |
+
"TilingProfiler": 0.03860926628112793,
|
| 739 |
+
"TransformConvOp": 0.002733469009399414,
|
| 740 |
+
"TritiumFusion": 0.08646178245544434,
|
| 741 |
+
"ValueNumbering": 0.003155946731567383,
|
| 742 |
+
"VectorizeDMA": 0.029859304428100586,
|
| 743 |
+
"VectorizeMatMult": 0.011672019958496094,
|
| 744 |
+
"WeightCoalescing": 0.004624366760253906,
|
| 745 |
+
"ZeroSizeTensorElimination": 0.0002124309539794922
|
| 746 |
+
},
|
| 747 |
+
"tensorizer": {
|
| 748 |
+
"DMATilingProfiler::TotalInstructionsAfterTiling": 1427.0,
|
| 749 |
+
"StaticProfiler::AifUb": 40.19935607910156,
|
| 750 |
+
"StaticProfiler::ArithmeticIntensityTensorizer": 134.3648223876953,
|
| 751 |
+
"StaticProfiler::AverageDmaLength": 4238.58251953125,
|
| 752 |
+
"StaticProfiler::AverageFractalPeUtilization": 100.0,
|
| 753 |
+
"StaticProfiler::AveragePartitionUtilization": 99.61003112792969,
|
| 754 |
+
"StaticProfiler::AveragePeUtilization": 100.0,
|
| 755 |
+
"StaticProfiler::DDRTransferBytes": 55879176.0,
|
| 756 |
+
"StaticProfiler::InternalTransferBytes": 9895936.0,
|
| 757 |
+
"StaticProfiler::LoadExpanded": 9729.0,
|
| 758 |
+
"StaticProfiler::LocalizationEfficiency": 334.2462158203125,
|
| 759 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 357.74188232421875,
|
| 760 |
+
"StaticProfiler::StoreExpanded": 769.0,
|
| 761 |
+
"StaticProfiler::TotalDMAExpanded": 10498.0,
|
| 762 |
+
"StaticProfiler::TotalDynamicInstancesCount": 1799.0,
|
| 763 |
+
"StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 1799.0,
|
| 764 |
+
"StaticProfiler::TotalLNCComm": 0.0,
|
| 765 |
+
"StaticProfiler::TotalLNCCommTransfer": 0.0,
|
| 766 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
|
| 767 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
|
| 768 |
+
"TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
|
| 769 |
+
"TilingProfiler::DmaInstructionsAfterTiling": 0.0,
|
| 770 |
+
"TilingProfiler::GenericInstructionsAfterTiling": 8.0,
|
| 771 |
+
"TilingProfiler::MatMultInstructionsAfterTiling": 1116.0,
|
| 772 |
+
"TilingProfiler::NumPfTransposes": 8.0,
|
| 773 |
+
"TilingProfiler::NumPfTransposesForIo": 3.0,
|
| 774 |
+
"TilingProfiler::NumPfTransposesForLocal": 3.0,
|
| 775 |
+
"TilingProfiler::NumPfTransposesForNonlocal": 2.0,
|
| 776 |
+
"TilingProfiler::PfTransposeInstructions": 66.0,
|
| 777 |
+
"TilingProfiler::PfTransposeInstructionsForIo": 18.0,
|
| 778 |
+
"TilingProfiler::PfTransposeInstructionsForLocal": 16.0,
|
| 779 |
+
"TilingProfiler::PfTransposeInstructionsForNonlocal": 32.0,
|
| 780 |
+
"TilingProfiler::ReduceInstructionsAfterTiling": 0.0,
|
| 781 |
+
"TilingProfiler::SimdInstructionsAfterTiling": 87.0,
|
| 782 |
+
"TilingProfiler::TotalInstructionsAfterTiling": 0.0,
|
| 783 |
+
"TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
|
| 784 |
+
"TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
|
| 785 |
+
"TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
|
| 786 |
+
"TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
|
| 787 |
+
"TransformConvOp::conv2d_column_packing": 0.0,
|
| 788 |
+
"TransformConvOp::conv2d_column_packing_1": 0.0,
|
| 789 |
+
"TransformConvOp::conv2d_column_packing_io10": 0.0,
|
| 790 |
+
"TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
|
| 791 |
+
}
|
| 792 |
+
},
|
| 793 |
+
"sg0002": {
|
| 794 |
+
"compiletime": {
|
| 795 |
+
"AGOrderingAnalysisPass": 0.037471771240234375,
|
| 796 |
+
"AffinePredicateResolution": 0.0048100948333740234,
|
| 797 |
+
"AliasDependencyElimination": 0.0002529621124267578,
|
| 798 |
+
"AliasDependencyInduction": 0.005568504333496094,
|
| 799 |
+
"AliasDependencyReset": 0.11161017417907715,
|
| 800 |
+
"BFComputeCutting": 0.0024290084838867188,
|
| 801 |
+
"BirCodeGenLoop": 0.32352304458618164,
|
| 802 |
+
"CCOpFusion": 0.033486366271972656,
|
| 803 |
+
"CanonicalizeDAGForPGTiling": 0.004197120666503906,
|
| 804 |
+
"CanonicalizeIR": 0.0025298595428466797,
|
| 805 |
+
"CoalesceCCOp": 0.007080078125,
|
| 806 |
+
"CommuteConcat": 0.0018744468688964844,
|
| 807 |
+
"DMALocalityOpt": 0.0021386146545410156,
|
| 808 |
+
"DMAProfiler": 0.01854729652404785,
|
| 809 |
+
"DMATilingProfiler": 0.015254497528076172,
|
| 810 |
+
"DataLocalityOpt": 0.1120154857635498,
|
| 811 |
+
"DataStreaming": 0.007681369781494141,
|
| 812 |
+
"DeConcat": 0.0022406578063964844,
|
| 813 |
+
"DeadCodeElimination": 0.0021486282348632813,
|
| 814 |
+
"DeadStoreElimination": 0.0063364505767822266,
|
| 815 |
+
"DelinearIndices": 0.0064697265625,
|
| 816 |
+
"Delinearization": 0.004486560821533203,
|
| 817 |
+
"DelinearizeSPMD": 0.01732611656188965,
|
| 818 |
+
"DoNothing": 9.441375732421875e-05,
|
| 819 |
+
"DramToDramTranspose": 0.02082037925720215,
|
| 820 |
+
"DumpGraphAndMetadata": 0.036411285400390625,
|
| 821 |
+
"EliminateDivs": 0.01006174087524414,
|
| 822 |
+
"ExpandBatchNorm": 0.0024886131286621094,
|
| 823 |
+
"ExpandISAMacro": 0.007379293441772461,
|
| 824 |
+
"FactorizeBlkDims": 0.023633480072021484,
|
| 825 |
+
"FactorizeThreadAxesInFreeDims": 0.0071103572845458984,
|
| 826 |
+
"FlattenMacroLoop": 0.009794235229492188,
|
| 827 |
+
"GenericAccessSimplifier": 0.0009224414825439453,
|
| 828 |
+
"InferInitValue": 0.12128233909606934,
|
| 829 |
+
"InferIntrinsicOnCC": 0.01005697250366211,
|
| 830 |
+
"InferNeuronTensor": 0.029047489166259766,
|
| 831 |
+
"InferNonlocalTensors": 0.017493009567260742,
|
| 832 |
+
"InferPSumTensor": 0.04303455352783203,
|
| 833 |
+
"InferShardAxis": 0.26027798652648926,
|
| 834 |
+
"InferSharedMemLoc": 0.012881040573120117,
|
| 835 |
+
"InlineNativeKernels": 0.002816915512084961,
|
| 836 |
+
"InsertCoreBarrier": 0.009889602661132813,
|
| 837 |
+
"InsertIOTransposes": 0.019797325134277344,
|
| 838 |
+
"InsertImplicitShardAxisBeforeISel": 0.05061173439025879,
|
| 839 |
+
"InsertLocalTransposes": 0.004299163818359375,
|
| 840 |
+
"InsertOffloadedTransposes": 0.008011579513549805,
|
| 841 |
+
"LICM": 0.009003639221191406,
|
| 842 |
+
"LateLegalizeInst": 0.013794183731079102,
|
| 843 |
+
"LateLegalizePostSplit": 0.013758182525634766,
|
| 844 |
+
"LateLowerReshapeOp": 0.0012693405151367188,
|
| 845 |
+
"LateLowerTensorOp": 0.002027750015258789,
|
| 846 |
+
"LateNeuronInstComb": 0.09844541549682617,
|
| 847 |
+
"LayoutPreprocessing": 0.025156497955322266,
|
| 848 |
+
"LayoutPreprocessingAndAnalysis": 0.06950831413269043,
|
| 849 |
+
"LayoutRequirementAnalysis": 0.0069408416748046875,
|
| 850 |
+
"LegalizeCCOpLayout": 0.003494739532470703,
|
| 851 |
+
"LegalizeOpLevelAlias": 0.0016810894012451172,
|
| 852 |
+
"LegalizePartitionReduce": 0.0026693344116210938,
|
| 853 |
+
"LegalizeSundaAccess": 0.0380399227142334,
|
| 854 |
+
"LegalizeSundaMacro": 0.10486245155334473,
|
| 855 |
+
"LegalizeType": 0.015400409698486328,
|
| 856 |
+
"LocalLayoutOpt": 0.012215137481689453,
|
| 857 |
+
"LoopFusion": 0.0049479007720947266,
|
| 858 |
+
"LoopSplitting": 0.0008144378662109375,
|
| 859 |
+
"LowerBroadcast": 0.0033435821533203125,
|
| 860 |
+
"LowerCCOpBlockAxis": 0.0037145614624023438,
|
| 861 |
+
"LowerComplexBroadcast": 0.0070230960845947266,
|
| 862 |
+
"LowerIntrinsics": 0.08174729347229004,
|
| 863 |
+
"LowerShardAxis": 0.020240068435668945,
|
| 864 |
+
"LowerTensorOp": 0.028459787368774414,
|
| 865 |
+
"LowerToSendRecv": 0.02129983901977539,
|
| 866 |
+
"LowerTranspose": 0.05583548545837402,
|
| 867 |
+
"MacroGeneration": 0.03631877899169922,
|
| 868 |
+
"MaskPropagation": 0.004620075225830078,
|
| 869 |
+
"MemcpyElimination": 0.04741477966308594,
|
| 870 |
+
"MutateDataType": 0.002264261245727539,
|
| 871 |
+
"NeuronAliasDependencyInduction": 0.002180337905883789,
|
| 872 |
+
"NeuronAliasDependencyReset": 0.08514618873596191,
|
| 873 |
+
"NeuronInstComb": 0.017351865768432617,
|
| 874 |
+
"NeuronLICM": 0.015241861343383789,
|
| 875 |
+
"NeuronLoopFusion": 0.05364656448364258,
|
| 876 |
+
"NeuronLoopInterchange": 0.002526521682739258,
|
| 877 |
+
"NeuronSimplifier": 0.06896662712097168,
|
| 878 |
+
"NeuronSimplifyPredicates": 0.023428916931152344,
|
| 879 |
+
"NeuronValueNumbering": 0.009569168090820313,
|
| 880 |
+
"OptimizeAliasedCopyChain": 0.0007548332214355469,
|
| 881 |
+
"OptimizeNKIKernels": 4.075549602508545,
|
| 882 |
+
"PAGLayoutOpt": 0.1111152172088623,
|
| 883 |
+
"PComputeCutting": 0.005707263946533203,
|
| 884 |
+
"PGLayoutTilingPipeline": 1.204958438873291,
|
| 885 |
+
"PGTiling": 0.4116194248199463,
|
| 886 |
+
"PadElimination": 0.0003600120544433594,
|
| 887 |
+
"ParAxesAnnotation": 0.050878286361694336,
|
| 888 |
+
"PartialLoopFusion": 0.0372469425201416,
|
| 889 |
+
"PartialSimdFusion": 0.021113157272338867,
|
| 890 |
+
"PerfectLoopNest": 0.007718086242675781,
|
| 891 |
+
"RecognizeOpIdiom": 0.0058002471923828125,
|
| 892 |
+
"Recompute": 0.0017511844635009766,
|
| 893 |
+
"RelaxPredicates": 0.00795745849609375,
|
| 894 |
+
"Rematerialization": 0.0019276142120361328,
|
| 895 |
+
"RemoveShardedPartitionAxes": 0.008410930633544922,
|
| 896 |
+
"ReshapeWeights": 0.0063934326171875,
|
| 897 |
+
"ResolveAccessConflict": 0.01411294937133789,
|
| 898 |
+
"ResolveComplicatePredicates": 0.004876375198364258,
|
| 899 |
+
"RewriteReplicationMatmul": 0.0017600059509277344,
|
| 900 |
+
"RewriteWeights": 0.004542827606201172,
|
| 901 |
+
"SFKVectorizer": 0.23946118354797363,
|
| 902 |
+
"ShardingPropagationAnalysis": 0.06259655952453613,
|
| 903 |
+
"SimpleAllReduceTiling": 0.004370212554931641,
|
| 904 |
+
"Simplifier": 0.0033507347106933594,
|
| 905 |
+
"SimplifyMacroPredicates": 0.056143999099731445,
|
| 906 |
+
"SimplifyNeuronTensor": 0.020067691802978516,
|
| 907 |
+
"SimplifySlice": 0.001861572265625,
|
| 908 |
+
"SimplifyTensor": 0.02954578399658203,
|
| 909 |
+
"SpillPSum": 0.03782367706298828,
|
| 910 |
+
"SplitAPUnionSets": 0.07312703132629395,
|
| 911 |
+
"SplitAccGrp": 0.002663135528564453,
|
| 912 |
+
"StaticProfiler": 0.02257680892944336,
|
| 913 |
+
"StaticTransposeLocalTensor": 0.003572225570678711,
|
| 914 |
+
"SundaISel": 0.10315561294555664,
|
| 915 |
+
"TCTransform": 0.0025663375854492188,
|
| 916 |
+
"TensorInitialization": 0.00860285758972168,
|
| 917 |
+
"TensorOpSimplifier": 0.008630037307739258,
|
| 918 |
+
"TensorOpTransform": 0.028581619262695313,
|
| 919 |
+
"TileCCOps": 0.00518488883972168,
|
| 920 |
+
"TilingProfiler": 0.023342609405517578,
|
| 921 |
+
"TransformConvOp": 0.008756637573242188,
|
| 922 |
+
"TritiumFusion": 0.13446974754333496,
|
| 923 |
+
"ValueNumbering": 0.003237485885620117,
|
| 924 |
+
"VectorizeDMA": 0.028183698654174805,
|
| 925 |
+
"VectorizeMatMult": 0.015199661254882813,
|
| 926 |
+
"WeightCoalescing": 0.0020062923431396484,
|
| 927 |
+
"ZeroSizeTensorElimination": 0.0001671314239501953
|
| 928 |
+
},
|
| 929 |
+
"tensorizer": {
|
| 930 |
+
"DMATilingProfiler::TotalInstructionsAfterTiling": 20773.0,
|
| 931 |
+
"StaticProfiler::AifUb": 131.73849487304688,
|
| 932 |
+
"StaticProfiler::ArithmeticIntensityTensorizer": 127.46285247802734,
|
| 933 |
+
"StaticProfiler::AverageDmaLength": 2400.2490234375,
|
| 934 |
+
"StaticProfiler::AverageFractalPeUtilization": 98.70232391357422,
|
| 935 |
+
"StaticProfiler::AveragePartitionUtilization": 94.02606201171875,
|
| 936 |
+
"StaticProfiler::AveragePeUtilization": 96.57791900634766,
|
| 937 |
+
"StaticProfiler::DDRTransferBytes": 361746464.0,
|
| 938 |
+
"StaticProfiler::InternalTransferBytes": 320526112.0,
|
| 939 |
+
"StaticProfiler::LoadExpanded": 84060.0,
|
| 940 |
+
"StaticProfiler::LocalizationEfficiency": 96.75444030761719,
|
| 941 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 99.23246002197266,
|
| 942 |
+
"StaticProfiler::StoreExpanded": 1898.0,
|
| 943 |
+
"StaticProfiler::TotalDMAExpanded": 85958.0,
|
| 944 |
+
"StaticProfiler::TotalDynamicInstancesCount": 25131.0,
|
| 945 |
+
"StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 24680.0,
|
| 946 |
+
"StaticProfiler::TotalLNCComm": 0.0,
|
| 947 |
+
"StaticProfiler::TotalLNCCommTransfer": 0.0,
|
| 948 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
|
| 949 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
|
| 950 |
+
"TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
|
| 951 |
+
"TilingProfiler::DmaInstructionsAfterTiling": 0.0,
|
| 952 |
+
"TilingProfiler::GenericInstructionsAfterTiling": 4.0,
|
| 953 |
+
"TilingProfiler::MatMultInstructionsAfterTiling": 10368.0,
|
| 954 |
+
"TilingProfiler::NumPfTransposes": 6.0,
|
| 955 |
+
"TilingProfiler::NumPfTransposesForIo": 1.0,
|
| 956 |
+
"TilingProfiler::NumPfTransposesForLocal": 1.0,
|
| 957 |
+
"TilingProfiler::NumPfTransposesForNonlocal": 4.0,
|
| 958 |
+
"TilingProfiler::PfTransposeInstructions": 10147.0,
|
| 959 |
+
"TilingProfiler::PfTransposeInstructionsForIo": 9504.0,
|
| 960 |
+
"TilingProfiler::PfTransposeInstructionsForLocal": 1.0,
|
| 961 |
+
"TilingProfiler::PfTransposeInstructionsForNonlocal": 642.0,
|
| 962 |
+
"TilingProfiler::ReduceInstructionsAfterTiling": 4.0,
|
| 963 |
+
"TilingProfiler::SimdInstructionsAfterTiling": 92.0,
|
| 964 |
+
"TilingProfiler::TotalInstructionsAfterTiling": 0.0,
|
| 965 |
+
"TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
|
| 966 |
+
"TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
|
| 967 |
+
"TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
|
| 968 |
+
"TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
|
| 969 |
+
"TransformConvOp::conv2d_column_packing": 0.0,
|
| 970 |
+
"TransformConvOp::conv2d_column_packing_1": 0.0,
|
| 971 |
+
"TransformConvOp::conv2d_column_packing_io10": 0.0,
|
| 972 |
+
"TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
|
| 973 |
+
}
|
| 974 |
+
},
|
| 975 |
+
"sg01": {
|
| 976 |
+
"compiletime": {
|
| 977 |
+
"CanonicalizeConv": 1.8000000636675395e-05,
|
| 978 |
+
"CanonicalizeForTensorizer": 1.1000000085914508e-05,
|
| 979 |
+
"Canonicalizer": 0.0002209999947808683,
|
| 980 |
+
"HoistCompute": 3.999999989900971e-06,
|
| 981 |
+
"IdentifyCrossPassTensors": 1.2000000424450263e-05,
|
| 982 |
+
"MemcastMotion": 7.999999979801942e-06,
|
| 983 |
+
"PenguinizeFunctions": 9.000000318337698e-06,
|
| 984 |
+
"PruneFunctions": 1.2000000424450263e-05,
|
| 985 |
+
"RemoveOptimizationBarriers": 2.9000000722589903e-05,
|
| 986 |
+
"ScatterMotion": 3.099999958067201e-05,
|
| 987 |
+
"TensorizerLegalizationPass": 1.2000000424450263e-05,
|
| 988 |
+
"VerifySupportedOps": 9.999999747378752e-06,
|
| 989 |
+
"algsimp": 4.999999873689376e-05,
|
| 990 |
+
"batchnorm_expander": 1.2000000424450263e-05,
|
| 991 |
+
"boundary-marker-removal": 3.999999989900971e-06,
|
| 992 |
+
"call-inliner": 7.999999979801942e-06,
|
| 993 |
+
"canonicalize-boundary-marker": 4.999999873689376e-06,
|
| 994 |
+
"collective-stream-id-checker": 3.000000106112566e-06,
|
| 995 |
+
"comparison-expander": 4.999999873689376e-06,
|
| 996 |
+
"computation-deduplicator": 1.700000029813964e-05,
|
| 997 |
+
"config-lowering": 0.00010399999882793054,
|
| 998 |
+
"constant_folding": 7.999999979801942e-06,
|
| 999 |
+
"cse": 1.1000000085914508e-05,
|
| 1000 |
+
"dce": 9.999999974752427e-07,
|
| 1001 |
+
"dynamic-slice-transpose": 3.999999989900971e-06,
|
| 1002 |
+
"eliminate-redundant-compare": 3.999999989900971e-06,
|
| 1003 |
+
"emit-offloaded-dropout": 2.499999936844688e-05,
|
| 1004 |
+
"flatten-call-graph": 7.000000096013537e-06,
|
| 1005 |
+
"fuse-send-recv": 1.8999999156221747e-05,
|
| 1006 |
+
"hilo-conditional-to-select": 3.999999989900971e-06,
|
| 1007 |
+
"hilo::LegalizeAlias": 3.999999989900971e-06,
|
| 1008 |
+
"hilo::NeuronInstCombine": 5.2999999752501026e-05,
|
| 1009 |
+
"hilo::NeuronOpFusion": 3.899999865097925e-05,
|
| 1010 |
+
"hilo::ReplaceTokenTypeWithU8Pass": 7.000000096013537e-06,
|
| 1011 |
+
"hilo::ScheduleFusion": 9.999999974752427e-07,
|
| 1012 |
+
"hilo::SixtyFourHack": 9.000000318337698e-06,
|
| 1013 |
+
"hilo::VerifyAliasing": 9.999999974752427e-07,
|
| 1014 |
+
"hlo-mac-count": 1.8999999156221747e-05,
|
| 1015 |
+
"legalize-ccops-for-tensorizer": 9.999999974752427e-07,
|
| 1016 |
+
"legalize-compare": 3.999999989900971e-06,
|
| 1017 |
+
"lower-argminmax-custom-call": 3.000000106112566e-06,
|
| 1018 |
+
"map-inline": 9.999999747378752e-06,
|
| 1019 |
+
"metadata-naming": 1.700000029813964e-05,
|
| 1020 |
+
"mlir::detail::OpToOpPassAdaptor": 1.9999999494757503e-05,
|
| 1021 |
+
"mlir::hlo::MhloToPyPenguin": 0.0009130000253207982,
|
| 1022 |
+
"mlir::mhlo::LowerComplexExtraPass": 7.200000254670158e-05,
|
| 1023 |
+
"mlir::mhlo::LowerComplexPass": 0.0001250000059371814,
|
| 1024 |
+
"native-to-custom-softmax": 4.999999873689376e-06,
|
| 1025 |
+
"native-to-custom-softmax-dx": 1.1000000085914508e-05,
|
| 1026 |
+
"neuron-hlo-verifier": 0.00036299999919719994,
|
| 1027 |
+
"operand_upcaster": 1.4000000192027073e-05,
|
| 1028 |
+
"post-par-pipe-begin": 1.9999999949504854e-06,
|
| 1029 |
+
"post-par-pipe-end": 0.0,
|
| 1030 |
+
"post-partition-simplification": 0.0004330000083427876,
|
| 1031 |
+
"replace-minimum-constant": 4.999999873689376e-06,
|
| 1032 |
+
"reshape-mover": 3.000000106112566e-06,
|
| 1033 |
+
"simplify-concat": 3.7000001611886546e-05,
|
| 1034 |
+
"simplify-while-loops": 1.9999999949504854e-06,
|
| 1035 |
+
"transform-variadic-reduce": 7.000000096013537e-06,
|
| 1036 |
+
"tuple-simplifier": 3.999999989900971e-06,
|
| 1037 |
+
"unpack-nested-aws-ntwsr": 3.000000106112566e-06,
|
| 1038 |
+
"unroll-while-loop": 0.0
|
| 1039 |
+
},
|
| 1040 |
+
"hilo": {
|
| 1041 |
+
"ArithmeticIntensity": 53.940223693847656,
|
| 1042 |
+
"HloMacCount": 3254779904.0,
|
| 1043 |
+
"Traffic": 120680992.0
|
| 1044 |
+
}
|
| 1045 |
+
},
|
| 1046 |
+
"sg02": {
|
| 1047 |
+
"compiletime": {
|
| 1048 |
+
"CanonicalizeConv": 1.9999999949504854e-06,
|
| 1049 |
+
"CanonicalizeForTensorizer": 1.2000000424450263e-05,
|
| 1050 |
+
"Canonicalizer": 0.0003380000125616789,
|
| 1051 |
+
"HoistCompute": 9.999999974752427e-07,
|
| 1052 |
+
"IdentifyCrossPassTensors": 9.999999747378752e-06,
|
| 1053 |
+
"MemcastMotion": 1.1000000085914508e-05,
|
| 1054 |
+
"PenguinizeFunctions": 7.999999979801942e-06,
|
| 1055 |
+
"PruneFunctions": 7.000000096013537e-06,
|
| 1056 |
+
"RemoveOptimizationBarriers": 2.300000051036477e-05,
|
| 1057 |
+
"ScatterMotion": 4.999999873689376e-06,
|
| 1058 |
+
"TensorizerLegalizationPass": 6.000000212225132e-06,
|
| 1059 |
+
"VerifySupportedOps": 1.2000000424450263e-05,
|
| 1060 |
+
"algsimp": 4.8000001697801054e-05,
|
| 1061 |
+
"batchnorm_expander": 1.1000000085914508e-05,
|
| 1062 |
+
"boundary-marker-removal": 3.999999989900971e-06,
|
| 1063 |
+
"call-inliner": 9.000000318337698e-06,
|
| 1064 |
+
"canonicalize-boundary-marker": 4.999999873689376e-06,
|
| 1065 |
+
"collective-stream-id-checker": 1.9999999949504854e-06,
|
| 1066 |
+
"comparison-expander": 4.999999873689376e-06,
|
| 1067 |
+
"computation-deduplicator": 1.8000000636675395e-05,
|
| 1068 |
+
"config-lowering": 3.600000127335079e-05,
|
| 1069 |
+
"constant_folding": 7.000000096013537e-06,
|
| 1070 |
+
"cse": 1.4000000192027073e-05,
|
| 1071 |
+
"dce": 9.999999974752427e-07,
|
| 1072 |
+
"dynamic-slice-transpose": 3.999999989900971e-06,
|
| 1073 |
+
"eliminate-redundant-compare": 3.000000106112566e-06,
|
| 1074 |
+
"emit-offloaded-dropout": 1.4000000192027073e-05,
|
| 1075 |
+
"flatten-call-graph": 9.999999747378752e-06,
|
| 1076 |
+
"fuse-send-recv": 1.4999999621068127e-05,
|
| 1077 |
+
"hilo-conditional-to-select": 6.000000212225132e-06,
|
| 1078 |
+
"hilo::LegalizeAlias": 1.9999999949504854e-06,
|
| 1079 |
+
"hilo::NeuronInstCombine": 1.2000000424450263e-05,
|
| 1080 |
+
"hilo::NeuronOpFusion": 1.1000000085914508e-05,
|
| 1081 |
+
"hilo::ReplaceTokenTypeWithU8Pass": 1.2999999853491317e-05,
|
| 1082 |
+
"hilo::ScheduleFusion": 3.999999989900971e-06,
|
| 1083 |
+
"hilo::SixtyFourHack": 3.899999865097925e-05,
|
| 1084 |
+
"hilo::VerifyAliasing": 9.999999974752427e-07,
|
| 1085 |
+
"hlo-mac-count": 0.004476999863982201,
|
| 1086 |
+
"legalize-ccops-for-tensorizer": 9.999999974752427e-07,
|
| 1087 |
+
"legalize-compare": 3.000000106112566e-06,
|
| 1088 |
+
"lower-argminmax-custom-call": 3.999999989900971e-06,
|
| 1089 |
+
"map-inline": 1.1000000085914508e-05,
|
| 1090 |
+
"metadata-naming": 1.2999999853491317e-05,
|
| 1091 |
+
"mlir::detail::OpToOpPassAdaptor": 2.9999999242136255e-05,
|
| 1092 |
+
"mlir::hlo::MhloToPyPenguin": 0.004188999999314547,
|
| 1093 |
+
"mlir::mhlo::LowerComplexExtraPass": 9.000000136438757e-05,
|
| 1094 |
+
"mlir::mhlo::LowerComplexPass": 0.000155999994603917,
|
| 1095 |
+
"native-to-custom-softmax": 4.999999873689376e-06,
|
| 1096 |
+
"native-to-custom-softmax-dx": 1.8000000636675395e-05,
|
| 1097 |
+
"neuron-hlo-verifier": 0.00033400001120753586,
|
| 1098 |
+
"operand_upcaster": 1.1000000085914508e-05,
|
| 1099 |
+
"post-par-pipe-begin": 9.999999974752427e-07,
|
| 1100 |
+
"post-par-pipe-end": 0.0,
|
| 1101 |
+
"post-partition-simplification": 0.0004529999860096723,
|
| 1102 |
+
"replace-minimum-constant": 7.000000096013537e-06,
|
| 1103 |
+
"reshape-mover": 3.000000106112566e-06,
|
| 1104 |
+
"simplify-concat": 3.7999998312443495e-05,
|
| 1105 |
+
"simplify-while-loops": 1.9999999949504854e-06,
|
| 1106 |
+
"transform-variadic-reduce": 4.199999966658652e-05,
|
| 1107 |
+
"tuple-simplifier": 3.999999989900971e-06,
|
| 1108 |
+
"unpack-nested-aws-ntwsr": 3.999999989900971e-06,
|
| 1109 |
+
"unroll-while-loop": 0.0
|
| 1110 |
+
},
|
| 1111 |
+
"hilo": {
|
| 1112 |
+
"ArithmeticIntensity": 14.666111946105957,
|
| 1113 |
+
"HloMacCount": 2572550144.0,
|
| 1114 |
+
"Traffic": 350815552.0
|
| 1115 |
+
}
|
| 1116 |
+
},
|
| 1117 |
+
"topk": {
|
| 1118 |
+
"compiletime": {
|
| 1119 |
+
"CoalesceCCOp": 0.006727457046508789,
|
| 1120 |
+
"DMALocalityOpt": 0.009476661682128906,
|
| 1121 |
+
"DMAProfiler": 0.006308317184448242,
|
| 1122 |
+
"DataStreaming": 0.029163122177124023,
|
| 1123 |
+
"DoNothing": 0.0004937648773193359,
|
| 1124 |
+
"ExpandISAMacro": 0.006926536560058594,
|
| 1125 |
+
"FactorizeBlkDims": 0.049018144607543945,
|
| 1126 |
+
"InferPSumTensor": 0.049260616302490234,
|
| 1127 |
+
"InferSharedMemLoc": 0.003329038619995117,
|
| 1128 |
+
"InsertCoreBarrier": 0.0059740543365478516,
|
| 1129 |
+
"LateLegalizeInst": 0.019405364990234375,
|
| 1130 |
+
"LateNeuronInstComb": 0.04540205001831055,
|
| 1131 |
+
"LegalizeSundaAccess": 0.046309709548950195,
|
| 1132 |
+
"LegalizeType": 0.05346846580505371,
|
| 1133 |
+
"LowerBroadcast": 0.015480279922485352,
|
| 1134 |
+
"LowerIntrinsics": 0.007883310317993164,
|
| 1135 |
+
"LowerTranspose": 0.010731220245361328,
|
| 1136 |
+
"NeuronInstComb": 0.03727889060974121,
|
| 1137 |
+
"NeuronLICM": 0.03041553497314453,
|
| 1138 |
+
"NeuronSimplifyPredicates": 0.006567955017089844,
|
| 1139 |
+
"NeuronValueNumbering": 0.015464067459106445,
|
| 1140 |
+
"SFKVectorizer": 0.07225155830383301,
|
| 1141 |
+
"SimpleAllReduceTiling": 0.006035804748535156,
|
| 1142 |
+
"SimplifyNeuronTensor": 0.11353325843811035,
|
| 1143 |
+
"SpillPSum": 0.0760800838470459,
|
| 1144 |
+
"WeightCoalescing": 0.01406407356262207
|
| 1145 |
+
}
|
| 1146 |
+
}
|
| 1147 |
+
}
|
context_encoding_model/_tp0_bk0/graph.neff
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:98834cf4cd3214e9f9fc84530eed5ef31b01fda5919c60b959ca4a30bcb80d0c
|
| 3 |
+
size 1188864
|
context_encoding_model/_tp0_bk0/log-neuron-cc.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
context_encoding_model/_tp0_bk0/metaneff.pb
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ec75ee80b2ec3909e8e315fa6044902ec93fdb3a62229b909f551426d04c56b6
|
| 3 |
+
size 2077993
|
context_encoding_model/_tp0_bk0/model.MODULE_e7e1b6c43bb87ca73ecc+2ee9f01d.hlo_module.pb
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b77f309407f7c741dd9b51614fc850fa657ce4e6ca40a18b4471f2b477760976
|
| 3 |
+
size 2163092
|
context_encoding_model/_tp0_bk0/model.MODULE_e7e1b6c43bb87ca73ecc+2ee9f01d.neff
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:98834cf4cd3214e9f9fc84530eed5ef31b01fda5919c60b959ca4a30bcb80d0c
|
| 3 |
+
size 1188864
|
context_encoding_model/_tp0_bk0/neuron_config.json
ADDED
|
@@ -0,0 +1,224 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_attn_implementation_autoset": false,
|
| 3 |
+
"_name_or_path": "/home/ubuntu/models/Qwen3-1.7B",
|
| 4 |
+
"add_cross_attention": false,
|
| 5 |
+
"architectures": [
|
| 6 |
+
"Qwen3ForCausalLM"
|
| 7 |
+
],
|
| 8 |
+
"attention_bias": false,
|
| 9 |
+
"attention_dropout": 0.0,
|
| 10 |
+
"attribute_map": {},
|
| 11 |
+
"bad_words_ids": null,
|
| 12 |
+
"begin_suppress_tokens": null,
|
| 13 |
+
"bos_token_id": 151643,
|
| 14 |
+
"chunk_size_feed_forward": 0,
|
| 15 |
+
"cross_attention_hidden_size": null,
|
| 16 |
+
"decoder_start_token_id": null,
|
| 17 |
+
"diversity_penalty": 0.0,
|
| 18 |
+
"do_sample": false,
|
| 19 |
+
"early_stopping": false,
|
| 20 |
+
"encoder_no_repeat_ngram_size": 0,
|
| 21 |
+
"eos_token_id": 151645,
|
| 22 |
+
"exponential_decay_length_penalty": null,
|
| 23 |
+
"finetuning_task": null,
|
| 24 |
+
"forced_bos_token_id": null,
|
| 25 |
+
"forced_eos_token_id": null,
|
| 26 |
+
"fused_spec_config": null,
|
| 27 |
+
"head_dim": 128,
|
| 28 |
+
"hidden_act": "silu",
|
| 29 |
+
"hidden_size": 2048,
|
| 30 |
+
"id2label": {
|
| 31 |
+
"0": "LABEL_0",
|
| 32 |
+
"1": "LABEL_1"
|
| 33 |
+
},
|
| 34 |
+
"initializer_range": 0.02,
|
| 35 |
+
"intermediate_size": 6144,
|
| 36 |
+
"is_decoder": false,
|
| 37 |
+
"is_encoder_decoder": false,
|
| 38 |
+
"label2id": {
|
| 39 |
+
"LABEL_0": 0,
|
| 40 |
+
"LABEL_1": 1
|
| 41 |
+
},
|
| 42 |
+
"length_penalty": 1.0,
|
| 43 |
+
"max_length": 20,
|
| 44 |
+
"max_position_embeddings": 40960,
|
| 45 |
+
"max_window_layers": 28,
|
| 46 |
+
"metadata": null,
|
| 47 |
+
"min_length": 0,
|
| 48 |
+
"model_type": "qwen3",
|
| 49 |
+
"neuron_config": {
|
| 50 |
+
"activation_quantization_type": null,
|
| 51 |
+
"allow_input_truncation": false,
|
| 52 |
+
"apply_seq_ids_mask": false,
|
| 53 |
+
"async_mode": false,
|
| 54 |
+
"attention_dp_degree": 1,
|
| 55 |
+
"attention_dtype": null,
|
| 56 |
+
"attn_block_cte_nki_kernel_enabled": false,
|
| 57 |
+
"attn_block_tkg_nki_kernel_cache_update": false,
|
| 58 |
+
"attn_block_tkg_nki_kernel_cascaded_attention": false,
|
| 59 |
+
"attn_block_tkg_nki_kernel_enabled": false,
|
| 60 |
+
"attn_cls": {
|
| 61 |
+
"__module__": "neuronx_distributed_inference.models.qwen3.modeling_qwen3",
|
| 62 |
+
"__name__": "NeuronQwen3Attention"
|
| 63 |
+
},
|
| 64 |
+
"attn_kernel_enabled": null,
|
| 65 |
+
"attn_tkg_builtin_kernel_enabled": false,
|
| 66 |
+
"attn_tkg_nki_kernel_enabled": false,
|
| 67 |
+
"batch_size": 1,
|
| 68 |
+
"bucket_n_active_tokens": true,
|
| 69 |
+
"buckets": [
|
| 70 |
+
128
|
| 71 |
+
],
|
| 72 |
+
"cast_type": "config",
|
| 73 |
+
"cc_pipeline_tiling_factor": 2,
|
| 74 |
+
"chunked_prefill_config": null,
|
| 75 |
+
"context_encoding_buckets": [
|
| 76 |
+
128
|
| 77 |
+
],
|
| 78 |
+
"cp_degree": 1,
|
| 79 |
+
"ctx_batch_size": 1,
|
| 80 |
+
"disable_kv_cache_tiling": false,
|
| 81 |
+
"draft_model_modules_to_not_convert": null,
|
| 82 |
+
"enable_bucketing": true,
|
| 83 |
+
"enable_cte_modular_flow": false,
|
| 84 |
+
"enable_eagle_draft_input_norm": false,
|
| 85 |
+
"enable_eagle_speculation": false,
|
| 86 |
+
"enable_fused_speculation": false,
|
| 87 |
+
"enable_long_context_mode": false,
|
| 88 |
+
"enable_output_completion_notifications": false,
|
| 89 |
+
"enable_spill_reload_dge": false,
|
| 90 |
+
"enable_token_tree": false,
|
| 91 |
+
"ep_degree": 1,
|
| 92 |
+
"expert_mlp_nki_kernel_enabled": null,
|
| 93 |
+
"flash_decoding_enabled": false,
|
| 94 |
+
"fused_qkv": false,
|
| 95 |
+
"fused_rmsnorm_skip_gamma": false,
|
| 96 |
+
"is_block_kv_layout": null,
|
| 97 |
+
"is_chunked_prefill": false,
|
| 98 |
+
"is_continuous_batching": true,
|
| 99 |
+
"is_eagle_draft": false,
|
| 100 |
+
"is_medusa": false,
|
| 101 |
+
"is_prefill_stage": true,
|
| 102 |
+
"is_prefix_caching": false,
|
| 103 |
+
"k_cache_transposed": false,
|
| 104 |
+
"kv_cache_batch_size": 8,
|
| 105 |
+
"kv_cache_padding_size": 0,
|
| 106 |
+
"kv_cache_quant": false,
|
| 107 |
+
"kv_cache_tiling": false,
|
| 108 |
+
"layer_boundary_markers": false,
|
| 109 |
+
"lm_head_pad": true,
|
| 110 |
+
"lm_head_pad_alignment_size": 1,
|
| 111 |
+
"local_ranks_size": 2,
|
| 112 |
+
"logical_nc_config": 2,
|
| 113 |
+
"lora_config": null,
|
| 114 |
+
"max_batch_size": 8,
|
| 115 |
+
"max_context_length": 4096,
|
| 116 |
+
"max_length": 4096,
|
| 117 |
+
"max_new_tokens": null,
|
| 118 |
+
"medusa_speculation_length": 0,
|
| 119 |
+
"medusa_tree": null,
|
| 120 |
+
"mlp_kernel_enabled": false,
|
| 121 |
+
"mlp_kernel_fuse_residual_add": false,
|
| 122 |
+
"modules_to_not_convert": null,
|
| 123 |
+
"moe_fused_nki_kernel_enabled": null,
|
| 124 |
+
"n_active_tokens": 4096,
|
| 125 |
+
"n_positions": 4096,
|
| 126 |
+
"num_medusa_heads": 0,
|
| 127 |
+
"on_cpu": false,
|
| 128 |
+
"on_device_sampling_config": {
|
| 129 |
+
"deterministic": false,
|
| 130 |
+
"do_sample": false,
|
| 131 |
+
"dynamic": true,
|
| 132 |
+
"global_topk": 256,
|
| 133 |
+
"on_device_sampling_config": true,
|
| 134 |
+
"temperature": 1.0,
|
| 135 |
+
"top_k": 1,
|
| 136 |
+
"top_k_kernel_enabled": false,
|
| 137 |
+
"top_p": 1.0
|
| 138 |
+
},
|
| 139 |
+
"output_logits": false,
|
| 140 |
+
"overrides_torch_dtype": true,
|
| 141 |
+
"pa_block_size": 4096,
|
| 142 |
+
"pa_num_blocks": 8,
|
| 143 |
+
"padding_side": "right",
|
| 144 |
+
"pp_degree": 1,
|
| 145 |
+
"prefix_buckets": null,
|
| 146 |
+
"qk_layernorm": false,
|
| 147 |
+
"qkv_kernel_enabled": false,
|
| 148 |
+
"qkv_kernel_fuse_residual_add": false,
|
| 149 |
+
"qkv_kernel_nbsd_layout": false,
|
| 150 |
+
"quantization_dtype": "int8",
|
| 151 |
+
"quantization_type": "per_tensor_symmetric",
|
| 152 |
+
"quantize_clamp_bound": Infinity,
|
| 153 |
+
"quantized": false,
|
| 154 |
+
"quantized_checkpoints_path": null,
|
| 155 |
+
"quantized_mlp_kernel_enabled": false,
|
| 156 |
+
"rmsnorm_quantize_kernel_enabled": false,
|
| 157 |
+
"router_topk_nki_kernel_enabled": null,
|
| 158 |
+
"rpl_reduce_dtype": null,
|
| 159 |
+
"save_sharded_checkpoint": true,
|
| 160 |
+
"scratchpad_page_size": null,
|
| 161 |
+
"seq_len": 4096,
|
| 162 |
+
"seq_len_threshold_for_cc_tiling": 16384,
|
| 163 |
+
"sequence_parallel_enabled": false,
|
| 164 |
+
"shared_mlp_nki_kernel_enabled": null,
|
| 165 |
+
"skip_sharding": false,
|
| 166 |
+
"skip_warmup": false,
|
| 167 |
+
"spec_batch_size": 8,
|
| 168 |
+
"speculation_length": 0,
|
| 169 |
+
"start_rank_id": 0,
|
| 170 |
+
"strided_context_parallel_kernel_enabled": false,
|
| 171 |
+
"target": null,
|
| 172 |
+
"tensor_capture_config": null,
|
| 173 |
+
"tile_cc": false,
|
| 174 |
+
"tkg_batch_size": 8,
|
| 175 |
+
"token_generation_buckets": null,
|
| 176 |
+
"token_tree_config": null,
|
| 177 |
+
"torch_dtype": "bfloat16",
|
| 178 |
+
"tp_degree": 2,
|
| 179 |
+
"vocab_parallel": false,
|
| 180 |
+
"weight_gather_seq_len_threshold": 32768,
|
| 181 |
+
"weights_to_skip_layout_optimization": [],
|
| 182 |
+
"world_size": 2
|
| 183 |
+
},
|
| 184 |
+
"no_repeat_ngram_size": 0,
|
| 185 |
+
"num_attention_heads": 16,
|
| 186 |
+
"num_beam_groups": 1,
|
| 187 |
+
"num_beams": 1,
|
| 188 |
+
"num_cores_per_group": 1,
|
| 189 |
+
"num_hidden_layers": 28,
|
| 190 |
+
"num_key_value_heads": 8,
|
| 191 |
+
"num_return_sequences": 1,
|
| 192 |
+
"output_attentions": false,
|
| 193 |
+
"output_hidden_states": false,
|
| 194 |
+
"output_scores": false,
|
| 195 |
+
"pad_token_id": 0,
|
| 196 |
+
"prefix": null,
|
| 197 |
+
"problem_type": null,
|
| 198 |
+
"pruned_heads": {},
|
| 199 |
+
"remove_invalid_values": false,
|
| 200 |
+
"repetition_penalty": 1.0,
|
| 201 |
+
"return_dict": true,
|
| 202 |
+
"return_dict_in_generate": false,
|
| 203 |
+
"rms_norm_eps": 1e-06,
|
| 204 |
+
"rope_scaling": null,
|
| 205 |
+
"rope_theta": 1000000,
|
| 206 |
+
"sep_token_id": null,
|
| 207 |
+
"sliding_window": null,
|
| 208 |
+
"suppress_tokens": null,
|
| 209 |
+
"task_specific_params": null,
|
| 210 |
+
"temperature": 1.0,
|
| 211 |
+
"tf_legacy_loss": false,
|
| 212 |
+
"tie_encoder_decoder": false,
|
| 213 |
+
"tie_word_embeddings": true,
|
| 214 |
+
"tokenizer_class": null,
|
| 215 |
+
"top_k": 50,
|
| 216 |
+
"top_p": 1.0,
|
| 217 |
+
"torchscript": false,
|
| 218 |
+
"transformers_version": "4.51.0",
|
| 219 |
+
"typical_p": 1.0,
|
| 220 |
+
"use_bfloat16": false,
|
| 221 |
+
"use_cache": true,
|
| 222 |
+
"use_sliding_window": false,
|
| 223 |
+
"vocab_size": 151936
|
| 224 |
+
}
|
context_encoding_model/_tp0_bk1/command.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
neuronx-cc compile --framework=XLA model.MODULE_2330bfb0632c950ddab1+62ecd68b.hlo_module.pb --output model.MODULE_2330bfb0632c950ddab1+62ecd68b.neff --target=trn2 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ' --lnc=2 -O1 '--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true' --logfile=log-neuron-cc.txt --verbose=35
|
context_encoding_model/_tp0_bk1/compile_flags.MODULE_2330bfb0632c950ddab1+62ecd68b.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
["--target=trn2", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "--lnc=2", "-O1", "--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true", "--logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk1/log-neuron-cc.txt"]
|
context_encoding_model/_tp0_bk1/global_metric_store.json
ADDED
|
@@ -0,0 +1,1177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"Average": {
|
| 3 |
+
"tensorizer": {
|
| 4 |
+
"StaticProfiler::AverageFractalPeUtilization": 98.71436309814453,
|
| 5 |
+
"StaticProfiler::AveragePartitionUtilization": 94.08551025390625,
|
| 6 |
+
"StaticProfiler::AveragePeUtilization": 96.60899353027344,
|
| 7 |
+
"StaticProfiler::LocalizationEfficiency": 95.931884765625,
|
| 8 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 99.52960968017578,
|
| 9 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
|
| 10 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 0.0
|
| 11 |
+
}
|
| 12 |
+
},
|
| 13 |
+
"Count": {
|
| 14 |
+
"tensorizer": {
|
| 15 |
+
"StaticProfiler::AverageFractalPeUtilization": 1.0,
|
| 16 |
+
"StaticProfiler::AveragePartitionUtilization": 1.0,
|
| 17 |
+
"StaticProfiler::AveragePeUtilization": 1.0,
|
| 18 |
+
"StaticProfiler::LocalizationEfficiency": 1.0,
|
| 19 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1.0,
|
| 20 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 1.0,
|
| 21 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 1.0
|
| 22 |
+
}
|
| 23 |
+
},
|
| 24 |
+
"Sum": {
|
| 25 |
+
"compiletime": {
|
| 26 |
+
"AGOrderingAnalysisPass": 0.08984947204589844,
|
| 27 |
+
"AffinePredicateResolution": 0.0009312629699707031,
|
| 28 |
+
"AliasDependencyElimination": 0.00024366378784179688,
|
| 29 |
+
"AliasDependencyInduction": 0.005263328552246094,
|
| 30 |
+
"AliasDependencyReset": 0.04176759719848633,
|
| 31 |
+
"BFComputeCutting": 0.002216339111328125,
|
| 32 |
+
"BirCodeGenLoop": 0.3660314083099365,
|
| 33 |
+
"CCOpFusion": 0.04759931564331055,
|
| 34 |
+
"CanonicalizeConv": 4.999999873689376e-05,
|
| 35 |
+
"CanonicalizeDAGForPGTiling": 0.006819009780883789,
|
| 36 |
+
"CanonicalizeForTensorizer": 3.7000001611886546e-05,
|
| 37 |
+
"CanonicalizeIR": 0.0015099048614501953,
|
| 38 |
+
"Canonicalizer": 0.0008099999977275729,
|
| 39 |
+
"CoalesceCCOp": 0.014320611953735352,
|
| 40 |
+
"CommuteConcat": 0.0021598339080810547,
|
| 41 |
+
"DMALocalityOpt": 0.006499767303466797,
|
| 42 |
+
"DMAProfiler": 0.031740665435791016,
|
| 43 |
+
"DMATilingProfiler": 0.007287263870239258,
|
| 44 |
+
"DataLocalityOpt": 0.15184760093688965,
|
| 45 |
+
"DataStreaming": 0.030707597732543945,
|
| 46 |
+
"DeConcat": 0.0052378177642822266,
|
| 47 |
+
"DeadCodeElimination": 0.0020182132720947266,
|
| 48 |
+
"DeadStoreElimination": 0.007268428802490234,
|
| 49 |
+
"DelinearIndices": 0.006491422653198242,
|
| 50 |
+
"Delinearization": 0.00418853759765625,
|
| 51 |
+
"DelinearizeSPMD": 0.03150320053100586,
|
| 52 |
+
"DoNothing": 0.0004954338073730469,
|
| 53 |
+
"DramToDramTranspose": 0.028717756271362305,
|
| 54 |
+
"DumpGraphAndMetadata": 0.04632568359375,
|
| 55 |
+
"EliminateDivs": 0.0021729469299316406,
|
| 56 |
+
"ExpandBatchNorm": 0.0017549991607666016,
|
| 57 |
+
"ExpandISAMacro": 0.01276254653930664,
|
| 58 |
+
"FactorizeBlkDims": 0.07627987861633301,
|
| 59 |
+
"FactorizeThreadAxesInFreeDims": 0.0036237239837646484,
|
| 60 |
+
"FlattenMacroLoop": 0.012475728988647461,
|
| 61 |
+
"GenericAccessSimplifier": 0.0007128715515136719,
|
| 62 |
+
"HoistCompute": 9.999999747378752e-06,
|
| 63 |
+
"IdentifyCrossPassTensors": 3.899999865097925e-05,
|
| 64 |
+
"InferInitValue": 0.11746096611022949,
|
| 65 |
+
"InferIntrinsicOnCC": 0.008626222610473633,
|
| 66 |
+
"InferNeuronTensor": 0.17520785331726074,
|
| 67 |
+
"InferNonlocalTensors": 0.02865004539489746,
|
| 68 |
+
"InferPSumTensor": 0.097686767578125,
|
| 69 |
+
"InferShardAxis": 0.2832298278808594,
|
| 70 |
+
"InferSharedMemLoc": 0.024610280990600586,
|
| 71 |
+
"InlineNativeKernels": 0.0025413036346435547,
|
| 72 |
+
"InsertCoreBarrier": 0.014633417129516602,
|
| 73 |
+
"InsertIOTransposes": 0.058136701583862305,
|
| 74 |
+
"InsertImplicitShardAxisBeforeISel": 0.024377822875976563,
|
| 75 |
+
"InsertLocalTransposes": 0.016265153884887695,
|
| 76 |
+
"InsertOffloadedTransposes": 0.03376030921936035,
|
| 77 |
+
"LICM": 0.015621185302734375,
|
| 78 |
+
"LateLegalizeInst": 0.037809133529663086,
|
| 79 |
+
"LateLegalizePostSplit": 0.01734447479248047,
|
| 80 |
+
"LateLowerReshapeOp": 0.0016047954559326172,
|
| 81 |
+
"LateLowerTensorOp": 0.0011878013610839844,
|
| 82 |
+
"LateNeuronInstComb": 0.07452130317687988,
|
| 83 |
+
"LayoutPreprocessing": 0.05620622634887695,
|
| 84 |
+
"LayoutPreprocessingAndAnalysis": 0.18100428581237793,
|
| 85 |
+
"LayoutRequirementAnalysis": 0.014584064483642578,
|
| 86 |
+
"LegalizeCCOpLayout": 0.0032541751861572266,
|
| 87 |
+
"LegalizeOpLevelAlias": 0.0010030269622802734,
|
| 88 |
+
"LegalizePartitionReduce": 0.002452373504638672,
|
| 89 |
+
"LegalizeSundaAccess": 0.07152366638183594,
|
| 90 |
+
"LegalizeSundaMacro": 0.0427708625793457,
|
| 91 |
+
"LegalizeType": 0.03647494316101074,
|
| 92 |
+
"LocalLayoutOpt": 0.014898538589477539,
|
| 93 |
+
"LoopFusion": 0.005176067352294922,
|
| 94 |
+
"LoopSplitting": 0.00048732757568359375,
|
| 95 |
+
"LowerBroadcast": 0.019514799118041992,
|
| 96 |
+
"LowerCCOpBlockAxis": 0.004888296127319336,
|
| 97 |
+
"LowerComplexBroadcast": 0.010831594467163086,
|
| 98 |
+
"LowerIntrinsics": 0.05155062675476074,
|
| 99 |
+
"LowerShardAxis": 0.017355918884277344,
|
| 100 |
+
"LowerTensorOp": 0.013428449630737305,
|
| 101 |
+
"LowerToSendRecv": 0.038613319396972656,
|
| 102 |
+
"LowerTranspose": 0.058027029037475586,
|
| 103 |
+
"MacroGeneration": 0.1058506965637207,
|
| 104 |
+
"MaskPropagation": 0.004538536071777344,
|
| 105 |
+
"MemcastMotion": 2.2000000171829015e-05,
|
| 106 |
+
"MemcpyElimination": 0.04629826545715332,
|
| 107 |
+
"MutateDataType": 0.0012559890747070313,
|
| 108 |
+
"NeuronAliasDependencyInduction": 0.0006165504455566406,
|
| 109 |
+
"NeuronAliasDependencyReset": 0.03877615928649902,
|
| 110 |
+
"NeuronInstComb": 0.05556750297546387,
|
| 111 |
+
"NeuronLICM": 0.04741477966308594,
|
| 112 |
+
"NeuronLoopFusion": 0.08438324928283691,
|
| 113 |
+
"NeuronLoopInterchange": 0.0028100013732910156,
|
| 114 |
+
"NeuronSimplifier": 0.0370326042175293,
|
| 115 |
+
"NeuronSimplifyPredicates": 0.029002904891967773,
|
| 116 |
+
"NeuronValueNumbering": 0.014310836791992188,
|
| 117 |
+
"OptimizeAliasedCopyChain": 0.0005040168762207031,
|
| 118 |
+
"OptimizeNKIKernels": 4.637849807739258,
|
| 119 |
+
"PAGLayoutOpt": 0.15427088737487793,
|
| 120 |
+
"PComputeCutting": 0.022019147872924805,
|
| 121 |
+
"PGLayoutTilingPipeline": 1.5585658550262451,
|
| 122 |
+
"PGTiling": 0.3059046268463135,
|
| 123 |
+
"PadElimination": 0.00058746337890625,
|
| 124 |
+
"ParAxesAnnotation": 0.07737350463867188,
|
| 125 |
+
"PartialLoopFusion": 0.03046131134033203,
|
| 126 |
+
"PartialSimdFusion": 0.008630514144897461,
|
| 127 |
+
"PenguinizeFunctions": 3.699999797390774e-05,
|
| 128 |
+
"PerfectLoopNest": 0.0037374496459960938,
|
| 129 |
+
"PruneFunctions": 4.600000102072954e-05,
|
| 130 |
+
"RecognizeOpIdiom": 0.0049936771392822266,
|
| 131 |
+
"Recompute": 0.0004494190216064453,
|
| 132 |
+
"RelaxPredicates": 0.00769495964050293,
|
| 133 |
+
"Rematerialization": 0.0034401416778564453,
|
| 134 |
+
"RemoveOptimizationBarriers": 4.8000001697801054e-05,
|
| 135 |
+
"RemoveShardedPartitionAxes": 0.008293628692626953,
|
| 136 |
+
"ReshapeWeights": 0.004475116729736328,
|
| 137 |
+
"ResolveAccessConflict": 0.0053598880767822266,
|
| 138 |
+
"ResolveComplicatePredicates": 0.0009164810180664063,
|
| 139 |
+
"RewriteReplicationMatmul": 0.00577545166015625,
|
| 140 |
+
"RewriteWeights": 0.010277271270751953,
|
| 141 |
+
"SFKVectorizer": 0.2676401138305664,
|
| 142 |
+
"ScatterMotion": 3.199999991920777e-05,
|
| 143 |
+
"ShardingPropagationAnalysis": 0.06793785095214844,
|
| 144 |
+
"SimpleAllReduceTiling": 0.011077165603637695,
|
| 145 |
+
"Simplifier": 0.0029976367950439453,
|
| 146 |
+
"SimplifyMacroPredicates": 0.025454998016357422,
|
| 147 |
+
"SimplifyNeuronTensor": 0.13071107864379883,
|
| 148 |
+
"SimplifySlice": 0.0008246898651123047,
|
| 149 |
+
"SimplifyTensor": 0.03260469436645508,
|
| 150 |
+
"SpillPSum": 0.0713953971862793,
|
| 151 |
+
"SplitAPUnionSets": 0.08632850646972656,
|
| 152 |
+
"SplitAccGrp": 0.002518892288208008,
|
| 153 |
+
"StaticProfiler": 0.026699542999267578,
|
| 154 |
+
"StaticTransposeLocalTensor": 0.009710550308227539,
|
| 155 |
+
"SundaISel": 0.08615612983703613,
|
| 156 |
+
"TCTransform": 0.0014863014221191406,
|
| 157 |
+
"TensorInitialization": 0.017354965209960938,
|
| 158 |
+
"TensorOpSimplifier": 0.004897356033325195,
|
| 159 |
+
"TensorOpTransform": 0.026237010955810547,
|
| 160 |
+
"TensorizerLegalizationPass": 4.099999932805076e-05,
|
| 161 |
+
"TileCCOps": 0.007733821868896484,
|
| 162 |
+
"TilingProfiler": 0.03455352783203125,
|
| 163 |
+
"TransformConvOp": 0.0042724609375,
|
| 164 |
+
"TritiumFusion": 0.11825895309448242,
|
| 165 |
+
"ValueNumbering": 0.0019876956939697266,
|
| 166 |
+
"VectorizeDMA": 0.03213214874267578,
|
| 167 |
+
"VectorizeMatMult": 0.010382413864135742,
|
| 168 |
+
"VerifySupportedOps": 3.300000025774352e-05,
|
| 169 |
+
"WeightCoalescing": 0.010597944259643555,
|
| 170 |
+
"ZeroSizeTensorElimination": 0.00017881393432617188,
|
| 171 |
+
"algsimp": 0.0017300000181421638,
|
| 172 |
+
"batchnorm_expander": 3.5000000934815034e-05,
|
| 173 |
+
"boundary-marker-removal": 1.2000000424450263e-05,
|
| 174 |
+
"call-inliner": 0.00022000000171829015,
|
| 175 |
+
"canonicalize-boundary-marker": 2.2000000171829015e-05,
|
| 176 |
+
"collective-stream-id-checker": 6.299999949987978e-05,
|
| 177 |
+
"comparison-expander": 0.0005039999959990382,
|
| 178 |
+
"computation-deduplicator": 5.8999998145736754e-05,
|
| 179 |
+
"config-lowering": 9.800000407267362e-05,
|
| 180 |
+
"constant-statistics": 0.0004199999966658652,
|
| 181 |
+
"constant_folding": 0.00015699998766649514,
|
| 182 |
+
"cse": 3.699999797390774e-05,
|
| 183 |
+
"dce": 4.099999932805076e-05,
|
| 184 |
+
"dot_decomposer": 0.0009689999860711396,
|
| 185 |
+
"dynamic-slice-transpose": 1.4999999621068127e-05,
|
| 186 |
+
"eliminate-redundant-compare": 0.00013899999612476677,
|
| 187 |
+
"emit-offloaded-dropout": 3.900000228895806e-05,
|
| 188 |
+
"flatten-call-graph": 0.0006180000491440296,
|
| 189 |
+
"fuse-send-recv": 5.7999997807201e-05,
|
| 190 |
+
"hilo-conditional-to-select": 1.2999999853491317e-05,
|
| 191 |
+
"hilo::LegalizeAlias": 1.1000000085914508e-05,
|
| 192 |
+
"hilo::NeuronInstCombine": 0.0001770000089891255,
|
| 193 |
+
"hilo::NeuronOpFusion": 3.7999998312443495e-05,
|
| 194 |
+
"hilo::ReplaceTokenTypeWithU8Pass": 4.199999966658652e-05,
|
| 195 |
+
"hilo::ScheduleFusion": 3.999999989900971e-06,
|
| 196 |
+
"hilo::SixtyFourHack": 6.199999916134402e-05,
|
| 197 |
+
"hilo::VerifyAliasing": 4.999999873689376e-06,
|
| 198 |
+
"hlo-mac-count": 0.011359000578522682,
|
| 199 |
+
"instruction-histogram": 0.0004990000161342323,
|
| 200 |
+
"io-con-pipe-begin": 3.999999989900971e-06,
|
| 201 |
+
"io-con-pipe-end": 9.999999974752427e-07,
|
| 202 |
+
"io-layout-normalization": 0.0007779999868944287,
|
| 203 |
+
"io-statistics": 4.099999932805076e-05,
|
| 204 |
+
"legalize-ccops-for-tensorizer": 3.999999989900971e-06,
|
| 205 |
+
"legalize-compare": 1.1000000085914508e-05,
|
| 206 |
+
"lower-argminmax-custom-call": 9.999999747378752e-06,
|
| 207 |
+
"map-inline": 0.0007570000016130507,
|
| 208 |
+
"metadata-naming": 4.8000001697801054e-05,
|
| 209 |
+
"mlir::detail::OpToOpPassAdaptor": 6.500000017695129e-05,
|
| 210 |
+
"mlir::hlo::MhloToPyPenguin": 0.006823000032454729,
|
| 211 |
+
"mlir::mhlo::LowerComplexExtraPass": 0.00024300000222865492,
|
| 212 |
+
"mlir::mhlo::LowerComplexPass": 0.0003090000245720148,
|
| 213 |
+
"native-to-custom-softmax": 0.00030399998649954796,
|
| 214 |
+
"native-to-custom-softmax-dx": 0.0016090000281110406,
|
| 215 |
+
"neuron-hlo-verifier": 0.010127999819815159,
|
| 216 |
+
"operand_upcaster": 4.199999966658652e-05,
|
| 217 |
+
"opt-barrier-removal": 0.00026199998683296144,
|
| 218 |
+
"post-par-pipe-begin": 0.00030399998649954796,
|
| 219 |
+
"post-par-pipe-end": 0.0,
|
| 220 |
+
"post-partition-simplification": 0.0014479999663308263,
|
| 221 |
+
"pre-par-pipe-begin": 9.999999974752427e-07,
|
| 222 |
+
"pre-par-pipe-end": 0.0,
|
| 223 |
+
"pre-partition-simplification": 0.05613299831748009,
|
| 224 |
+
"replace-minimum-constant": 0.00029700002050958574,
|
| 225 |
+
"reshape-mover": 5.60000044060871e-05,
|
| 226 |
+
"simplify-concat": 0.00010799999290611595,
|
| 227 |
+
"simplify-while-loops": 5.0000002374872565e-05,
|
| 228 |
+
"transform-variadic-reduce": 6.299999949987978e-05,
|
| 229 |
+
"tuple-simplifier": 0.00014699998428113759,
|
| 230 |
+
"unpack-nested-aws-ntwsr": 0.00021999998716637492,
|
| 231 |
+
"unroll-while-loop": 7.000000096013537e-06,
|
| 232 |
+
"zero_sized_hlo_elimination": 0.0007450000266544521
|
| 233 |
+
},
|
| 234 |
+
"hilo": {
|
| 235 |
+
"ConstantSize": 467583.0,
|
| 236 |
+
"HloInputCount": 371.0,
|
| 237 |
+
"HloMacCount": 13175750656.0,
|
| 238 |
+
"HloOutputCount": 57.0,
|
| 239 |
+
"IfmapSize": 3910914048.0,
|
| 240 |
+
"OfmapSize": 1879048192.0,
|
| 241 |
+
"OutputsReadFromCount": 0.0,
|
| 242 |
+
"PassthroughTensorsCount": 0.0,
|
| 243 |
+
"RedundantOutputCount": 0.0,
|
| 244 |
+
"Traffic": 871990400.0
|
| 245 |
+
},
|
| 246 |
+
"tensorizer": {
|
| 247 |
+
"DMATilingProfiler::TotalInstructionsAfterTiling": 20919.0,
|
| 248 |
+
"StaticProfiler::AifUb": 147.03309631347656,
|
| 249 |
+
"StaticProfiler::ArithmeticIntensityTensorizer": 141.05162048339844,
|
| 250 |
+
"StaticProfiler::AverageDmaLength": 2425.82958984375,
|
| 251 |
+
"StaticProfiler::DDRTransferBytes": 365941792.0,
|
| 252 |
+
"StaticProfiler::InternalTransferBytes": 325506848.0,
|
| 253 |
+
"StaticProfiler::LoadExpanded": 84060.0,
|
| 254 |
+
"StaticProfiler::StoreExpanded": 1898.0,
|
| 255 |
+
"StaticProfiler::TotalDMAExpanded": 85958.0,
|
| 256 |
+
"StaticProfiler::TotalDynamicInstancesCount": 25383.0,
|
| 257 |
+
"StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 24932.0,
|
| 258 |
+
"StaticProfiler::TotalLNCComm": 0.0,
|
| 259 |
+
"StaticProfiler::TotalLNCCommTransfer": 0.0,
|
| 260 |
+
"TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
|
| 261 |
+
"TilingProfiler::DmaInstructionsAfterTiling": 0.0,
|
| 262 |
+
"TilingProfiler::GenericInstructionsAfterTiling": 4.0,
|
| 263 |
+
"TilingProfiler::MatMultInstructionsAfterTiling": 10464.0,
|
| 264 |
+
"TilingProfiler::NumPfTransposes": 6.0,
|
| 265 |
+
"TilingProfiler::NumPfTransposesForIo": 1.0,
|
| 266 |
+
"TilingProfiler::NumPfTransposesForLocal": 1.0,
|
| 267 |
+
"TilingProfiler::NumPfTransposesForNonlocal": 4.0,
|
| 268 |
+
"TilingProfiler::PfTransposeInstructions": 10195.0,
|
| 269 |
+
"TilingProfiler::PfTransposeInstructionsForIo": 9504.0,
|
| 270 |
+
"TilingProfiler::PfTransposeInstructionsForLocal": 1.0,
|
| 271 |
+
"TilingProfiler::PfTransposeInstructionsForNonlocal": 690.0,
|
| 272 |
+
"TilingProfiler::ReduceInstructionsAfterTiling": 4.0,
|
| 273 |
+
"TilingProfiler::SimdInstructionsAfterTiling": 92.0,
|
| 274 |
+
"TilingProfiler::TotalInstructionsAfterTiling": 0.0,
|
| 275 |
+
"TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
|
| 276 |
+
"TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
|
| 277 |
+
"TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
|
| 278 |
+
"TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
|
| 279 |
+
"TransformConvOp::conv2d_column_packing": 0.0,
|
| 280 |
+
"TransformConvOp::conv2d_column_packing_1": 0.0,
|
| 281 |
+
"TransformConvOp::conv2d_column_packing_io10": 0.0,
|
| 282 |
+
"TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
|
| 283 |
+
}
|
| 284 |
+
},
|
| 285 |
+
"all": {
|
| 286 |
+
"compiletime": {
|
| 287 |
+
"algsimp": 0.0015739999944344163,
|
| 288 |
+
"call-inliner": 0.00019500000053085387,
|
| 289 |
+
"collective-stream-id-checker": 5.400000009103678e-05,
|
| 290 |
+
"comparison-expander": 0.0004710000066552311,
|
| 291 |
+
"constant-statistics": 0.0004199999966658652,
|
| 292 |
+
"constant_folding": 0.0001320000010309741,
|
| 293 |
+
"dce": 3.7999998312443495e-05,
|
| 294 |
+
"dot_decomposer": 0.0009689999860711396,
|
| 295 |
+
"eliminate-redundant-compare": 0.00011899999663000926,
|
| 296 |
+
"flatten-call-graph": 0.0005910000181756914,
|
| 297 |
+
"hlo-mac-count": 0.006432000081986189,
|
| 298 |
+
"instruction-histogram": 0.0004990000161342323,
|
| 299 |
+
"io-con-pipe-begin": 3.999999989900971e-06,
|
| 300 |
+
"io-con-pipe-end": 9.999999974752427e-07,
|
| 301 |
+
"io-layout-normalization": 0.0007779999868944287,
|
| 302 |
+
"io-statistics": 4.099999932805076e-05,
|
| 303 |
+
"map-inline": 0.0007220000261440873,
|
| 304 |
+
"native-to-custom-softmax": 0.00028899998869746923,
|
| 305 |
+
"native-to-custom-softmax-dx": 0.00046099998871795833,
|
| 306 |
+
"neuron-hlo-verifier": 0.0090549997985363,
|
| 307 |
+
"opt-barrier-removal": 0.00026199998683296144,
|
| 308 |
+
"pre-par-pipe-begin": 9.999999974752427e-07,
|
| 309 |
+
"pre-par-pipe-end": 0.0,
|
| 310 |
+
"pre-partition-simplification": 0.05613299831748009,
|
| 311 |
+
"replace-minimum-constant": 0.0002770000137388706,
|
| 312 |
+
"reshape-mover": 4.70000013592653e-05,
|
| 313 |
+
"simplify-while-loops": 4.3000000005122274e-05,
|
| 314 |
+
"tuple-simplifier": 0.00013299999409355223,
|
| 315 |
+
"unpack-nested-aws-ntwsr": 0.00020799999765586108,
|
| 316 |
+
"unroll-while-loop": 7.000000096013537e-06,
|
| 317 |
+
"zero_sized_hlo_elimination": 0.0007450000266544521
|
| 318 |
+
}
|
| 319 |
+
},
|
| 320 |
+
"attention_isa_kernel": {
|
| 321 |
+
"compiletime": {
|
| 322 |
+
"CoalesceCCOp": 0.00019693374633789063,
|
| 323 |
+
"DMALocalityOpt": 0.00016736984252929688,
|
| 324 |
+
"DMAProfiler": 0.00026297569274902344,
|
| 325 |
+
"DataStreaming": 0.0002357959747314453,
|
| 326 |
+
"DoNothing": 0.004472255706787109,
|
| 327 |
+
"ExpandISAMacro": 0.00024008750915527344,
|
| 328 |
+
"FactorizeBlkDims": 0.001956939697265625,
|
| 329 |
+
"InferPSumTensor": 0.0005483627319335938,
|
| 330 |
+
"InferSharedMemLoc": 0.0012214183807373047,
|
| 331 |
+
"InsertCoreBarrier": 0.000339508056640625,
|
| 332 |
+
"LateLegalizeInst": 0.00020360946655273438,
|
| 333 |
+
"LateNeuronInstComb": 0.002096414566040039,
|
| 334 |
+
"LegalizeSundaAccess": 0.00022792816162109375,
|
| 335 |
+
"LegalizeType": 0.00030231475830078125,
|
| 336 |
+
"LowerBroadcast": 0.0002613067626953125,
|
| 337 |
+
"LowerIntrinsics": 0.0003268718719482422,
|
| 338 |
+
"LowerTranspose": 0.0002701282501220703,
|
| 339 |
+
"NeuronInstComb": 0.000457763671875,
|
| 340 |
+
"NeuronLICM": 0.0002644062042236328,
|
| 341 |
+
"NeuronSimplifyPredicates": 0.0002472400665283203,
|
| 342 |
+
"NeuronValueNumbering": 0.00029158592224121094,
|
| 343 |
+
"SFKVectorizer": 0.002269744873046875,
|
| 344 |
+
"SimpleAllReduceTiling": 0.00020956993103027344,
|
| 345 |
+
"SimplifyNeuronTensor": 0.0006353855133056641,
|
| 346 |
+
"SpillPSum": 0.0006325244903564453,
|
| 347 |
+
"WeightCoalescing": 0.00021409988403320313
|
| 348 |
+
}
|
| 349 |
+
},
|
| 350 |
+
"cumsum": {
|
| 351 |
+
"compiletime": {
|
| 352 |
+
"CoalesceCCOp": 0.00030303001403808594,
|
| 353 |
+
"DMALocalityOpt": 0.00025963783264160156,
|
| 354 |
+
"DMAProfiler": 0.0011391639709472656,
|
| 355 |
+
"DataStreaming": 0.0004107952117919922,
|
| 356 |
+
"DoNothing": 0.00016951560974121094,
|
| 357 |
+
"ExpandISAMacro": 0.0008628368377685547,
|
| 358 |
+
"FactorizeBlkDims": 0.0031676292419433594,
|
| 359 |
+
"InferPSumTensor": 0.0011391639709472656,
|
| 360 |
+
"InferSharedMemLoc": 0.0004911422729492188,
|
| 361 |
+
"InsertCoreBarrier": 0.0014476776123046875,
|
| 362 |
+
"LateLegalizeInst": 0.0051555633544921875,
|
| 363 |
+
"LateNeuronInstComb": 0.0011050701141357422,
|
| 364 |
+
"LegalizeSundaAccess": 0.0025599002838134766,
|
| 365 |
+
"LegalizeType": 0.0004215240478515625,
|
| 366 |
+
"LowerBroadcast": 0.0014843940734863281,
|
| 367 |
+
"LowerIntrinsics": 0.0016138553619384766,
|
| 368 |
+
"LowerTranspose": 0.00037097930908203125,
|
| 369 |
+
"NeuronInstComb": 0.0021207332611083984,
|
| 370 |
+
"NeuronLICM": 0.0007026195526123047,
|
| 371 |
+
"NeuronSimplifyPredicates": 0.004625082015991211,
|
| 372 |
+
"NeuronValueNumbering": 0.0007369518280029297,
|
| 373 |
+
"SFKVectorizer": 0.005678415298461914,
|
| 374 |
+
"SimpleAllReduceTiling": 0.0004096031188964844,
|
| 375 |
+
"SimplifyNeuronTensor": 0.0030858516693115234,
|
| 376 |
+
"SpillPSum": 0.0021026134490966797,
|
| 377 |
+
"WeightCoalescing": 0.0003502368927001953
|
| 378 |
+
}
|
| 379 |
+
},
|
| 380 |
+
"sg00": {
|
| 381 |
+
"compiletime": {
|
| 382 |
+
"CanonicalizeConv": 2.499999936844688e-05,
|
| 383 |
+
"CanonicalizeForTensorizer": 1.2999999853491317e-05,
|
| 384 |
+
"Canonicalizer": 0.00033400001120753586,
|
| 385 |
+
"HoistCompute": 3.000000106112566e-06,
|
| 386 |
+
"IdentifyCrossPassTensors": 1.4999999621068127e-05,
|
| 387 |
+
"MemcastMotion": 1.1000000085914508e-05,
|
| 388 |
+
"PenguinizeFunctions": 1.4999999621068127e-05,
|
| 389 |
+
"PruneFunctions": 1.4000000192027073e-05,
|
| 390 |
+
"RemoveOptimizationBarriers": 2.099999983329326e-05,
|
| 391 |
+
"ScatterMotion": 2.9999999242136255e-05,
|
| 392 |
+
"TensorizerLegalizationPass": 1.8999999156221747e-05,
|
| 393 |
+
"VerifySupportedOps": 1.1000000085914508e-05,
|
| 394 |
+
"algsimp": 5.8000001445179805e-05,
|
| 395 |
+
"batchnorm_expander": 1.1000000085914508e-05,
|
| 396 |
+
"boundary-marker-removal": 3.999999989900971e-06,
|
| 397 |
+
"call-inliner": 7.999999979801942e-06,
|
| 398 |
+
"canonicalize-boundary-marker": 1.2000000424450263e-05,
|
| 399 |
+
"collective-stream-id-checker": 3.000000106112566e-06,
|
| 400 |
+
"comparison-expander": 4.999999873689376e-06,
|
| 401 |
+
"computation-deduplicator": 1.8000000636675395e-05,
|
| 402 |
+
"config-lowering": 2.9999999242136255e-05,
|
| 403 |
+
"constant_folding": 9.000000318337698e-06,
|
| 404 |
+
"cse": 1.4000000192027073e-05,
|
| 405 |
+
"dce": 9.999999974752427e-07,
|
| 406 |
+
"dynamic-slice-transpose": 4.999999873689376e-06,
|
| 407 |
+
"eliminate-redundant-compare": 3.999999989900971e-06,
|
| 408 |
+
"emit-offloaded-dropout": 1.4000000192027073e-05,
|
| 409 |
+
"flatten-call-graph": 9.000000318337698e-06,
|
| 410 |
+
"fuse-send-recv": 2.099999983329326e-05,
|
| 411 |
+
"hilo-conditional-to-select": 3.000000106112566e-06,
|
| 412 |
+
"hilo::LegalizeAlias": 4.999999873689376e-06,
|
| 413 |
+
"hilo::NeuronInstCombine": 5.700000110664405e-05,
|
| 414 |
+
"hilo::NeuronOpFusion": 4.999999873689376e-06,
|
| 415 |
+
"hilo::ReplaceTokenTypeWithU8Pass": 1.1000000085914508e-05,
|
| 416 |
+
"hilo::ScheduleFusion": 9.999999974752427e-07,
|
| 417 |
+
"hilo::SixtyFourHack": 9.999999747378752e-06,
|
| 418 |
+
"hilo::VerifyAliasing": 1.9999999949504854e-06,
|
| 419 |
+
"hlo-mac-count": 7.899999764049426e-05,
|
| 420 |
+
"legalize-ccops-for-tensorizer": 1.9999999949504854e-06,
|
| 421 |
+
"legalize-compare": 3.999999989900971e-06,
|
| 422 |
+
"lower-argminmax-custom-call": 3.000000106112566e-06,
|
| 423 |
+
"map-inline": 1.2000000424450263e-05,
|
| 424 |
+
"metadata-naming": 1.4999999621068127e-05,
|
| 425 |
+
"mlir::detail::OpToOpPassAdaptor": 2.2000000171829015e-05,
|
| 426 |
+
"mlir::hlo::MhloToPyPenguin": 0.0009059999720193446,
|
| 427 |
+
"mlir::mhlo::LowerComplexExtraPass": 9.600000339560211e-05,
|
| 428 |
+
"mlir::mhlo::LowerComplexPass": 0.00018000000272877514,
|
| 429 |
+
"native-to-custom-softmax": 4.999999873689376e-06,
|
| 430 |
+
"native-to-custom-softmax-dx": 0.0011220000451430678,
|
| 431 |
+
"neuron-hlo-verifier": 0.00035700001171790063,
|
| 432 |
+
"operand_upcaster": 1.8000000636675395e-05,
|
| 433 |
+
"post-par-pipe-begin": 0.0003020000003743917,
|
| 434 |
+
"post-par-pipe-end": 0.0,
|
| 435 |
+
"post-partition-simplification": 0.0005360000068321824,
|
| 436 |
+
"replace-minimum-constant": 7.000000096013537e-06,
|
| 437 |
+
"reshape-mover": 3.000000106112566e-06,
|
| 438 |
+
"simplify-concat": 3.400000059627928e-05,
|
| 439 |
+
"simplify-while-loops": 3.000000106112566e-06,
|
| 440 |
+
"transform-variadic-reduce": 7.999999979801942e-06,
|
| 441 |
+
"tuple-simplifier": 4.999999873689376e-06,
|
| 442 |
+
"unpack-nested-aws-ntwsr": 3.999999989900971e-06,
|
| 443 |
+
"unroll-while-loop": 0.0
|
| 444 |
+
},
|
| 445 |
+
"hilo": {
|
| 446 |
+
"ArithmeticIntensity": 8.479304313659668,
|
| 447 |
+
"ConstantSize": 467583.0,
|
| 448 |
+
"HloInputCount": 371.0,
|
| 449 |
+
"HloMacCount": 1677721600.0,
|
| 450 |
+
"HloOutputCount": 57.0,
|
| 451 |
+
"IfmapSize": 3910914048.0,
|
| 452 |
+
"OfmapSize": 1879048192.0,
|
| 453 |
+
"OutputsReadFromCount": 0.0,
|
| 454 |
+
"PassthroughTensorsCount": 0.0,
|
| 455 |
+
"RedundantOutputCount": 0.0,
|
| 456 |
+
"Traffic": 395721504.0
|
| 457 |
+
}
|
| 458 |
+
},
|
| 459 |
+
"sg0000": {
|
| 460 |
+
"compiletime": {
|
| 461 |
+
"AGOrderingAnalysisPass": 0.05208444595336914,
|
| 462 |
+
"AffinePredicateResolution": 0.002167940139770508,
|
| 463 |
+
"AliasDependencyElimination": 0.00020503997802734375,
|
| 464 |
+
"AliasDependencyInduction": 0.006783246994018555,
|
| 465 |
+
"AliasDependencyReset": 0.20125508308410645,
|
| 466 |
+
"BFComputeCutting": 0.007937908172607422,
|
| 467 |
+
"BirCodeGenLoop": 0.10184049606323242,
|
| 468 |
+
"CCOpFusion": 0.03359842300415039,
|
| 469 |
+
"CanonicalizeDAGForPGTiling": 0.003628253936767578,
|
| 470 |
+
"CanonicalizeIR": 0.0030901432037353516,
|
| 471 |
+
"CoalesceCCOp": 0.017004013061523438,
|
| 472 |
+
"CommuteConcat": 0.0019147396087646484,
|
| 473 |
+
"DMALocalityOpt": 0.008169889450073242,
|
| 474 |
+
"DMAProfiler": 0.019730091094970703,
|
| 475 |
+
"DMATilingProfiler": 0.01212453842163086,
|
| 476 |
+
"DataLocalityOpt": 0.20879435539245605,
|
| 477 |
+
"DataStreaming": 0.017726421356201172,
|
| 478 |
+
"DeConcat": 0.0039784908294677734,
|
| 479 |
+
"DeadCodeElimination": 0.0020265579223632813,
|
| 480 |
+
"DeadStoreElimination": 0.023813247680664063,
|
| 481 |
+
"DelinearIndices": 0.020769357681274414,
|
| 482 |
+
"Delinearization": 0.03343391418457031,
|
| 483 |
+
"DelinearizeSPMD": 0.0467836856842041,
|
| 484 |
+
"DoNothing": 8.96453857421875e-05,
|
| 485 |
+
"DramToDramTranspose": 0.029311418533325195,
|
| 486 |
+
"DumpGraphAndMetadata": 0.008599281311035156,
|
| 487 |
+
"EliminateDivs": 0.003629446029663086,
|
| 488 |
+
"ExpandBatchNorm": 0.0015780925750732422,
|
| 489 |
+
"ExpandISAMacro": 0.006983280181884766,
|
| 490 |
+
"FactorizeBlkDims": 0.02126312255859375,
|
| 491 |
+
"FactorizeThreadAxesInFreeDims": 0.003243684768676758,
|
| 492 |
+
"FlattenMacroLoop": 0.0065686702728271484,
|
| 493 |
+
"GenericAccessSimplifier": 0.001466512680053711,
|
| 494 |
+
"InferInitValue": 0.04482269287109375,
|
| 495 |
+
"InferIntrinsicOnCC": 0.01812601089477539,
|
| 496 |
+
"InferNeuronTensor": 0.10232234001159668,
|
| 497 |
+
"InferNonlocalTensors": 0.17829585075378418,
|
| 498 |
+
"InferPSumTensor": 0.08844804763793945,
|
| 499 |
+
"InferShardAxis": 0.7131092548370361,
|
| 500 |
+
"InferSharedMemLoc": 0.007193565368652344,
|
| 501 |
+
"InlineNativeKernels": 0.006009101867675781,
|
| 502 |
+
"InsertCoreBarrier": 0.015059709548950195,
|
| 503 |
+
"InsertIOTransposes": 0.07647299766540527,
|
| 504 |
+
"InsertImplicitShardAxisBeforeISel": 0.020087480545043945,
|
| 505 |
+
"InsertLocalTransposes": 0.037857770919799805,
|
| 506 |
+
"InsertOffloadedTransposes": 0.022881269454956055,
|
| 507 |
+
"LICM": 0.012552261352539063,
|
| 508 |
+
"LateLegalizeInst": 0.025588512420654297,
|
| 509 |
+
"LateLegalizePostSplit": 0.012372970581054688,
|
| 510 |
+
"LateLowerReshapeOp": 0.004400491714477539,
|
| 511 |
+
"LateLowerTensorOp": 0.004253387451171875,
|
| 512 |
+
"LateNeuronInstComb": 0.039977073669433594,
|
| 513 |
+
"LayoutPreprocessing": 0.06799173355102539,
|
| 514 |
+
"LayoutPreprocessingAndAnalysis": 0.1176137924194336,
|
| 515 |
+
"LayoutRequirementAnalysis": 0.01578998565673828,
|
| 516 |
+
"LegalizeCCOpLayout": 0.0030679702758789063,
|
| 517 |
+
"LegalizeOpLevelAlias": 0.0017116069793701172,
|
| 518 |
+
"LegalizePartitionReduce": 0.002843618392944336,
|
| 519 |
+
"LegalizeSundaAccess": 0.08243513107299805,
|
| 520 |
+
"LegalizeSundaMacro": 0.02523207664489746,
|
| 521 |
+
"LegalizeType": 0.014882326126098633,
|
| 522 |
+
"LocalLayoutOpt": 0.019226789474487305,
|
| 523 |
+
"LoopFusion": 0.007382631301879883,
|
| 524 |
+
"LoopSplitting": 0.0006470680236816406,
|
| 525 |
+
"LowerBroadcast": 0.005588054656982422,
|
| 526 |
+
"LowerCCOpBlockAxis": 0.0077972412109375,
|
| 527 |
+
"LowerComplexBroadcast": 0.005771636962890625,
|
| 528 |
+
"LowerIntrinsics": 0.06823062896728516,
|
| 529 |
+
"LowerShardAxis": 0.01669931411743164,
|
| 530 |
+
"LowerTensorOp": 0.028963327407836914,
|
| 531 |
+
"LowerToSendRecv": 0.003696441650390625,
|
| 532 |
+
"LowerTranspose": 0.022225618362426758,
|
| 533 |
+
"MacroGeneration": 0.0702672004699707,
|
| 534 |
+
"MaskPropagation": 0.010986804962158203,
|
| 535 |
+
"MemcpyElimination": 0.1031653881072998,
|
| 536 |
+
"MutateDataType": 0.0030710697174072266,
|
| 537 |
+
"NeuronAliasDependencyInduction": 0.0008504390716552734,
|
| 538 |
+
"NeuronAliasDependencyReset": 0.10823488235473633,
|
| 539 |
+
"NeuronInstComb": 0.032953739166259766,
|
| 540 |
+
"NeuronLICM": 0.018877506256103516,
|
| 541 |
+
"NeuronLoopFusion": 0.03511810302734375,
|
| 542 |
+
"NeuronLoopInterchange": 0.009130239486694336,
|
| 543 |
+
"NeuronSimplifier": 0.02072596549987793,
|
| 544 |
+
"NeuronSimplifyPredicates": 0.005728721618652344,
|
| 545 |
+
"NeuronValueNumbering": 0.017284870147705078,
|
| 546 |
+
"OptimizeAliasedCopyChain": 0.0006775856018066406,
|
| 547 |
+
"OptimizeNKIKernels": 0.5134098529815674,
|
| 548 |
+
"PAGLayoutOpt": 0.5583286285400391,
|
| 549 |
+
"PComputeCutting": 0.026990413665771484,
|
| 550 |
+
"PGLayoutTilingPipeline": 2.505728006362915,
|
| 551 |
+
"PGTiling": 0.4031352996826172,
|
| 552 |
+
"PadElimination": 0.0005686283111572266,
|
| 553 |
+
"ParAxesAnnotation": 0.48941731452941895,
|
| 554 |
+
"PartialLoopFusion": 0.03877878189086914,
|
| 555 |
+
"PartialSimdFusion": 0.05450034141540527,
|
| 556 |
+
"PerfectLoopNest": 0.006276607513427734,
|
| 557 |
+
"RecognizeOpIdiom": 0.006324291229248047,
|
| 558 |
+
"Recompute": 0.0004134178161621094,
|
| 559 |
+
"RelaxPredicates": 0.008553743362426758,
|
| 560 |
+
"Rematerialization": 0.012713193893432617,
|
| 561 |
+
"RemoveShardedPartitionAxes": 0.04062914848327637,
|
| 562 |
+
"ReshapeWeights": 0.0019867420196533203,
|
| 563 |
+
"ResolveAccessConflict": 0.006893634796142578,
|
| 564 |
+
"ResolveComplicatePredicates": 0.0020072460174560547,
|
| 565 |
+
"RewriteReplicationMatmul": 0.002567291259765625,
|
| 566 |
+
"RewriteWeights": 0.008040666580200195,
|
| 567 |
+
"SFKVectorizer": 0.35219240188598633,
|
| 568 |
+
"ShardingPropagationAnalysis": 0.03732752799987793,
|
| 569 |
+
"SimpleAllReduceTiling": 0.00998234748840332,
|
| 570 |
+
"Simplifier": 0.00720524787902832,
|
| 571 |
+
"SimplifyMacroPredicates": 0.008156061172485352,
|
| 572 |
+
"SimplifyNeuronTensor": 0.020155906677246094,
|
| 573 |
+
"SimplifySlice": 0.0016894340515136719,
|
| 574 |
+
"SimplifyTensor": 0.01220250129699707,
|
| 575 |
+
"SpillPSum": 0.03788638114929199,
|
| 576 |
+
"SplitAPUnionSets": 0.05510139465332031,
|
| 577 |
+
"SplitAccGrp": 0.006468534469604492,
|
| 578 |
+
"StaticProfiler": 0.017852783203125,
|
| 579 |
+
"StaticTransposeLocalTensor": 0.00736546516418457,
|
| 580 |
+
"SundaISel": 0.09026622772216797,
|
| 581 |
+
"TCTransform": 0.0017704963684082031,
|
| 582 |
+
"TensorInitialization": 0.010450363159179688,
|
| 583 |
+
"TensorOpSimplifier": 0.02020740509033203,
|
| 584 |
+
"TensorOpTransform": 0.027513504028320313,
|
| 585 |
+
"TileCCOps": 0.008568286895751953,
|
| 586 |
+
"TilingProfiler": 0.03838157653808594,
|
| 587 |
+
"TransformConvOp": 0.007506370544433594,
|
| 588 |
+
"TritiumFusion": 0.050549983978271484,
|
| 589 |
+
"ValueNumbering": 0.0038373470306396484,
|
| 590 |
+
"VectorizeDMA": 0.017205238342285156,
|
| 591 |
+
"VectorizeMatMult": 0.021669626235961914,
|
| 592 |
+
"WeightCoalescing": 0.004259347915649414,
|
| 593 |
+
"ZeroSizeTensorElimination": 0.00019121170043945313
|
| 594 |
+
},
|
| 595 |
+
"tensorizer": {
|
| 596 |
+
"DMATilingProfiler::TotalInstructionsAfterTiling": 847.0,
|
| 597 |
+
"StaticProfiler::AifUb": 8.478300094604492,
|
| 598 |
+
"StaticProfiler::ArithmeticIntensityTensorizer": 131.77493286132813,
|
| 599 |
+
"StaticProfiler::AverageDmaLength": 1355.7093505859375,
|
| 600 |
+
"StaticProfiler::AverageFractalPeUtilization": 99.68699645996094,
|
| 601 |
+
"StaticProfiler::AveragePartitionUtilization": 99.0614013671875,
|
| 602 |
+
"StaticProfiler::AveragePeUtilization": 99.3685073852539,
|
| 603 |
+
"StaticProfiler::DDRTransferBytes": 29617926.0,
|
| 604 |
+
"StaticProfiler::InternalTransferBytes": 11470848.0,
|
| 605 |
+
"StaticProfiler::LoadExpanded": 12422.0,
|
| 606 |
+
"StaticProfiler::LocalizationEfficiency": 1554.2613525390625,
|
| 607 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1973.466064453125,
|
| 608 |
+
"StaticProfiler::StoreExpanded": 5889.0,
|
| 609 |
+
"StaticProfiler::TotalDMAExpanded": 18311.0,
|
| 610 |
+
"StaticProfiler::TotalDynamicInstancesCount": 1115.0,
|
| 611 |
+
"StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 1113.0,
|
| 612 |
+
"StaticProfiler::TotalLNCComm": 0.0,
|
| 613 |
+
"StaticProfiler::TotalLNCCommTransfer": 0.0,
|
| 614 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
|
| 615 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
|
| 616 |
+
"TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
|
| 617 |
+
"TilingProfiler::DmaInstructionsAfterTiling": 0.0,
|
| 618 |
+
"TilingProfiler::GenericInstructionsAfterTiling": 20.0,
|
| 619 |
+
"TilingProfiler::MatMultInstructionsAfterTiling": 514.0,
|
| 620 |
+
"TilingProfiler::NumPfTransposes": 6.0,
|
| 621 |
+
"TilingProfiler::NumPfTransposesForIo": 1.0,
|
| 622 |
+
"TilingProfiler::NumPfTransposesForLocal": 2.0,
|
| 623 |
+
"TilingProfiler::NumPfTransposesForNonlocal": 3.0,
|
| 624 |
+
"TilingProfiler::PfTransposeInstructions": 104.0,
|
| 625 |
+
"TilingProfiler::PfTransposeInstructionsForIo": 32.0,
|
| 626 |
+
"TilingProfiler::PfTransposeInstructionsForLocal": 24.0,
|
| 627 |
+
"TilingProfiler::PfTransposeInstructionsForNonlocal": 48.0,
|
| 628 |
+
"TilingProfiler::ReduceInstructionsAfterTiling": 0.0,
|
| 629 |
+
"TilingProfiler::SimdInstructionsAfterTiling": 86.0,
|
| 630 |
+
"TilingProfiler::TotalInstructionsAfterTiling": 0.0,
|
| 631 |
+
"TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
|
| 632 |
+
"TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
|
| 633 |
+
"TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
|
| 634 |
+
"TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
|
| 635 |
+
"TransformConvOp::conv2d_column_packing": 0.0,
|
| 636 |
+
"TransformConvOp::conv2d_column_packing_1": 0.0,
|
| 637 |
+
"TransformConvOp::conv2d_column_packing_io10": 0.0,
|
| 638 |
+
"TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
|
| 639 |
+
}
|
| 640 |
+
},
|
| 641 |
+
"sg0001": {
|
| 642 |
+
"compiletime": {
|
| 643 |
+
"AGOrderingAnalysisPass": 0.10172820091247559,
|
| 644 |
+
"AffinePredicateResolution": 0.0019948482513427734,
|
| 645 |
+
"AliasDependencyElimination": 0.0002758502960205078,
|
| 646 |
+
"AliasDependencyInduction": 0.007086038589477539,
|
| 647 |
+
"AliasDependencyReset": 0.13438987731933594,
|
| 648 |
+
"BFComputeCutting": 0.0027761459350585938,
|
| 649 |
+
"BirCodeGenLoop": 0.05368757247924805,
|
| 650 |
+
"CCOpFusion": 0.03205680847167969,
|
| 651 |
+
"CanonicalizeDAGForPGTiling": 0.0052297115325927734,
|
| 652 |
+
"CanonicalizeIR": 0.002682924270629883,
|
| 653 |
+
"CoalesceCCOp": 0.008353471755981445,
|
| 654 |
+
"CommuteConcat": 0.0031654834747314453,
|
| 655 |
+
"DMALocalityOpt": 0.0032248497009277344,
|
| 656 |
+
"DMAProfiler": 0.006761789321899414,
|
| 657 |
+
"DMATilingProfiler": 0.00853872299194336,
|
| 658 |
+
"DataLocalityOpt": 0.366649866104126,
|
| 659 |
+
"DataStreaming": 0.008889198303222656,
|
| 660 |
+
"DeConcat": 0.002901792526245117,
|
| 661 |
+
"DeadCodeElimination": 0.016579151153564453,
|
| 662 |
+
"DeadStoreElimination": 0.029788732528686523,
|
| 663 |
+
"DelinearIndices": 0.019867897033691406,
|
| 664 |
+
"Delinearization": 0.0065822601318359375,
|
| 665 |
+
"DelinearizeSPMD": 0.023911237716674805,
|
| 666 |
+
"DoNothing": 7.867813110351563e-05,
|
| 667 |
+
"DramToDramTranspose": 0.026773452758789063,
|
| 668 |
+
"DumpGraphAndMetadata": 0.006331682205200195,
|
| 669 |
+
"EliminateDivs": 0.006492137908935547,
|
| 670 |
+
"ExpandBatchNorm": 0.0019371509552001953,
|
| 671 |
+
"ExpandISAMacro": 0.011901378631591797,
|
| 672 |
+
"FactorizeBlkDims": 0.03787398338317871,
|
| 673 |
+
"FactorizeThreadAxesInFreeDims": 0.0023696422576904297,
|
| 674 |
+
"FlattenMacroLoop": 0.006732463836669922,
|
| 675 |
+
"GenericAccessSimplifier": 0.0011754035949707031,
|
| 676 |
+
"InferInitValue": 0.07735943794250488,
|
| 677 |
+
"InferIntrinsicOnCC": 0.017465829849243164,
|
| 678 |
+
"InferNeuronTensor": 0.09335732460021973,
|
| 679 |
+
"InferNonlocalTensors": 0.029421567916870117,
|
| 680 |
+
"InferPSumTensor": 0.12906312942504883,
|
| 681 |
+
"InferShardAxis": 0.7434248924255371,
|
| 682 |
+
"InferSharedMemLoc": 0.005700111389160156,
|
| 683 |
+
"InlineNativeKernels": 0.002834320068359375,
|
| 684 |
+
"InsertCoreBarrier": 0.006781339645385742,
|
| 685 |
+
"InsertIOTransposes": 0.0841522216796875,
|
| 686 |
+
"InsertImplicitShardAxisBeforeISel": 0.012434244155883789,
|
| 687 |
+
"InsertLocalTransposes": 0.019251346588134766,
|
| 688 |
+
"InsertOffloadedTransposes": 0.028300762176513672,
|
| 689 |
+
"LICM": 0.005795001983642578,
|
| 690 |
+
"LateLegalizeInst": 0.011514902114868164,
|
| 691 |
+
"LateLegalizePostSplit": 0.005158185958862305,
|
| 692 |
+
"LateLowerReshapeOp": 0.0047490596771240234,
|
| 693 |
+
"LateLowerTensorOp": 0.004218101501464844,
|
| 694 |
+
"LateNeuronInstComb": 0.047844648361206055,
|
| 695 |
+
"LayoutPreprocessing": 0.03463029861450195,
|
| 696 |
+
"LayoutPreprocessingAndAnalysis": 0.06621217727661133,
|
| 697 |
+
"LayoutRequirementAnalysis": 0.007728099822998047,
|
| 698 |
+
"LegalizeCCOpLayout": 0.003231048583984375,
|
| 699 |
+
"LegalizeOpLevelAlias": 0.001981973648071289,
|
| 700 |
+
"LegalizePartitionReduce": 0.0027234554290771484,
|
| 701 |
+
"LegalizeSundaAccess": 0.04511404037475586,
|
| 702 |
+
"LegalizeSundaMacro": 0.022600412368774414,
|
| 703 |
+
"LegalizeType": 0.0190885066986084,
|
| 704 |
+
"LocalLayoutOpt": 0.04217672348022461,
|
| 705 |
+
"LoopFusion": 0.012153148651123047,
|
| 706 |
+
"LoopSplitting": 0.0006983280181884766,
|
| 707 |
+
"LowerBroadcast": 0.001943826675415039,
|
| 708 |
+
"LowerCCOpBlockAxis": 0.007781505584716797,
|
| 709 |
+
"LowerComplexBroadcast": 0.004039287567138672,
|
| 710 |
+
"LowerIntrinsics": 0.08824563026428223,
|
| 711 |
+
"LowerShardAxis": 0.008327722549438477,
|
| 712 |
+
"LowerTensorOp": 0.033898115158081055,
|
| 713 |
+
"LowerToSendRecv": 0.005768775939941406,
|
| 714 |
+
"LowerTranspose": 0.02297377586364746,
|
| 715 |
+
"MacroGeneration": 0.16904258728027344,
|
| 716 |
+
"MaskPropagation": 0.007157087326049805,
|
| 717 |
+
"MemcpyElimination": 0.08653593063354492,
|
| 718 |
+
"MutateDataType": 0.001874685287475586,
|
| 719 |
+
"NeuronAliasDependencyInduction": 0.0008199214935302734,
|
| 720 |
+
"NeuronAliasDependencyReset": 0.09268832206726074,
|
| 721 |
+
"NeuronInstComb": 0.013442754745483398,
|
| 722 |
+
"NeuronLICM": 0.04093337059020996,
|
| 723 |
+
"NeuronLoopFusion": 0.07855010032653809,
|
| 724 |
+
"NeuronLoopInterchange": 0.0029878616333007813,
|
| 725 |
+
"NeuronSimplifier": 0.013553857803344727,
|
| 726 |
+
"NeuronSimplifyPredicates": 0.0043621063232421875,
|
| 727 |
+
"NeuronValueNumbering": 0.011638164520263672,
|
| 728 |
+
"OptimizeAliasedCopyChain": 0.001085042953491211,
|
| 729 |
+
"OptimizeNKIKernels": 0.4002358913421631,
|
| 730 |
+
"PAGLayoutOpt": 0.5899946689605713,
|
| 731 |
+
"PComputeCutting": 0.011747598648071289,
|
| 732 |
+
"PGLayoutTilingPipeline": 2.3099381923675537,
|
| 733 |
+
"PGTiling": 0.39591336250305176,
|
| 734 |
+
"PadElimination": 0.0018284320831298828,
|
| 735 |
+
"ParAxesAnnotation": 0.5343668460845947,
|
| 736 |
+
"PartialLoopFusion": 0.0648810863494873,
|
| 737 |
+
"PartialSimdFusion": 0.06934404373168945,
|
| 738 |
+
"PerfectLoopNest": 0.010063648223876953,
|
| 739 |
+
"RecognizeOpIdiom": 0.006760358810424805,
|
| 740 |
+
"Recompute": 0.0004215240478515625,
|
| 741 |
+
"RelaxPredicates": 0.004682064056396484,
|
| 742 |
+
"Rematerialization": 0.0020973682403564453,
|
| 743 |
+
"RemoveShardedPartitionAxes": 0.03322100639343262,
|
| 744 |
+
"ReshapeWeights": 0.005750894546508789,
|
| 745 |
+
"ResolveAccessConflict": 0.005618572235107422,
|
| 746 |
+
"ResolveComplicatePredicates": 0.0011665821075439453,
|
| 747 |
+
"RewriteReplicationMatmul": 0.0025589466094970703,
|
| 748 |
+
"RewriteWeights": 0.010002374649047852,
|
| 749 |
+
"SFKVectorizer": 0.2708115577697754,
|
| 750 |
+
"ShardingPropagationAnalysis": 0.04528522491455078,
|
| 751 |
+
"SimpleAllReduceTiling": 0.003036975860595703,
|
| 752 |
+
"Simplifier": 0.004547834396362305,
|
| 753 |
+
"SimplifyMacroPredicates": 0.0300595760345459,
|
| 754 |
+
"SimplifyNeuronTensor": 0.014966249465942383,
|
| 755 |
+
"SimplifySlice": 0.01027679443359375,
|
| 756 |
+
"SimplifyTensor": 0.020308732986450195,
|
| 757 |
+
"SpillPSum": 0.04539823532104492,
|
| 758 |
+
"SplitAPUnionSets": 0.023496150970458984,
|
| 759 |
+
"SplitAccGrp": 0.0026144981384277344,
|
| 760 |
+
"StaticProfiler": 0.006074190139770508,
|
| 761 |
+
"StaticTransposeLocalTensor": 0.006592273712158203,
|
| 762 |
+
"SundaISel": 0.06954693794250488,
|
| 763 |
+
"TCTransform": 0.001828908920288086,
|
| 764 |
+
"TensorInitialization": 0.00876927375793457,
|
| 765 |
+
"TensorOpSimplifier": 0.011527299880981445,
|
| 766 |
+
"TensorOpTransform": 0.03972220420837402,
|
| 767 |
+
"TileCCOps": 0.00546574592590332,
|
| 768 |
+
"TilingProfiler": 0.02742171287536621,
|
| 769 |
+
"TransformConvOp": 0.006824016571044922,
|
| 770 |
+
"TritiumFusion": 0.11011958122253418,
|
| 771 |
+
"ValueNumbering": 0.004981040954589844,
|
| 772 |
+
"VectorizeDMA": 0.03582024574279785,
|
| 773 |
+
"VectorizeMatMult": 0.0291445255279541,
|
| 774 |
+
"WeightCoalescing": 0.008509397506713867,
|
| 775 |
+
"ZeroSizeTensorElimination": 0.00014853477478027344
|
| 776 |
+
},
|
| 777 |
+
"tensorizer": {
|
| 778 |
+
"DMATilingProfiler::TotalInstructionsAfterTiling": 1813.0,
|
| 779 |
+
"StaticProfiler::AifUb": 76.42292022705078,
|
| 780 |
+
"StaticProfiler::ArithmeticIntensityTensorizer": 227.36143493652344,
|
| 781 |
+
"StaticProfiler::AverageDmaLength": 4034.3251953125,
|
| 782 |
+
"StaticProfiler::AverageFractalPeUtilization": 100.0,
|
| 783 |
+
"StaticProfiler::AveragePartitionUtilization": 99.65364074707031,
|
| 784 |
+
"StaticProfiler::AveragePeUtilization": 100.0,
|
| 785 |
+
"StaticProfiler::DDRTransferBytes": 63514120.0,
|
| 786 |
+
"StaticProfiler::InternalTransferBytes": 13500416.0,
|
| 787 |
+
"StaticProfiler::LoadExpanded": 10497.0,
|
| 788 |
+
"StaticProfiler::LocalizationEfficiency": 297.5042419433594,
|
| 789 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 352.84381103515625,
|
| 790 |
+
"StaticProfiler::StoreExpanded": 2561.0,
|
| 791 |
+
"StaticProfiler::TotalDMAExpanded": 13058.0,
|
| 792 |
+
"StaticProfiler::TotalDynamicInstancesCount": 2025.0,
|
| 793 |
+
"StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 2025.0,
|
| 794 |
+
"StaticProfiler::TotalLNCComm": 0.0,
|
| 795 |
+
"StaticProfiler::TotalLNCCommTransfer": 0.0,
|
| 796 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
|
| 797 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
|
| 798 |
+
"TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
|
| 799 |
+
"TilingProfiler::DmaInstructionsAfterTiling": 0.0,
|
| 800 |
+
"TilingProfiler::GenericInstructionsAfterTiling": 16.0,
|
| 801 |
+
"TilingProfiler::MatMultInstructionsAfterTiling": 1280.0,
|
| 802 |
+
"TilingProfiler::NumPfTransposes": 7.0,
|
| 803 |
+
"TilingProfiler::NumPfTransposesForIo": 3.0,
|
| 804 |
+
"TilingProfiler::NumPfTransposesForLocal": 2.0,
|
| 805 |
+
"TilingProfiler::NumPfTransposesForNonlocal": 2.0,
|
| 806 |
+
"TilingProfiler::PfTransposeInstructions": 116.0,
|
| 807 |
+
"TilingProfiler::PfTransposeInstructionsForIo": 36.0,
|
| 808 |
+
"TilingProfiler::PfTransposeInstructionsForLocal": 16.0,
|
| 809 |
+
"TilingProfiler::PfTransposeInstructionsForNonlocal": 64.0,
|
| 810 |
+
"TilingProfiler::ReduceInstructionsAfterTiling": 0.0,
|
| 811 |
+
"TilingProfiler::SimdInstructionsAfterTiling": 113.0,
|
| 812 |
+
"TilingProfiler::TotalInstructionsAfterTiling": 0.0,
|
| 813 |
+
"TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
|
| 814 |
+
"TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
|
| 815 |
+
"TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
|
| 816 |
+
"TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
|
| 817 |
+
"TransformConvOp::conv2d_column_packing": 0.0,
|
| 818 |
+
"TransformConvOp::conv2d_column_packing_1": 0.0,
|
| 819 |
+
"TransformConvOp::conv2d_column_packing_io10": 0.0,
|
| 820 |
+
"TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
|
| 821 |
+
}
|
| 822 |
+
},
|
| 823 |
+
"sg0002": {
|
| 824 |
+
"compiletime": {
|
| 825 |
+
"AGOrderingAnalysisPass": 0.08984947204589844,
|
| 826 |
+
"AffinePredicateResolution": 0.0009312629699707031,
|
| 827 |
+
"AliasDependencyElimination": 0.00024366378784179688,
|
| 828 |
+
"AliasDependencyInduction": 0.005263328552246094,
|
| 829 |
+
"AliasDependencyReset": 0.04176759719848633,
|
| 830 |
+
"BFComputeCutting": 0.002216339111328125,
|
| 831 |
+
"BirCodeGenLoop": 0.3660314083099365,
|
| 832 |
+
"CCOpFusion": 0.04759931564331055,
|
| 833 |
+
"CanonicalizeDAGForPGTiling": 0.006819009780883789,
|
| 834 |
+
"CanonicalizeIR": 0.0015099048614501953,
|
| 835 |
+
"CoalesceCCOp": 0.007388591766357422,
|
| 836 |
+
"CommuteConcat": 0.0021598339080810547,
|
| 837 |
+
"DMALocalityOpt": 0.002432584762573242,
|
| 838 |
+
"DMAProfiler": 0.022784948348999023,
|
| 839 |
+
"DMATilingProfiler": 0.007287263870239258,
|
| 840 |
+
"DataLocalityOpt": 0.15184760093688965,
|
| 841 |
+
"DataStreaming": 0.007554292678833008,
|
| 842 |
+
"DeConcat": 0.0052378177642822266,
|
| 843 |
+
"DeadCodeElimination": 0.0020182132720947266,
|
| 844 |
+
"DeadStoreElimination": 0.007268428802490234,
|
| 845 |
+
"DelinearIndices": 0.006491422653198242,
|
| 846 |
+
"Delinearization": 0.00418853759765625,
|
| 847 |
+
"DelinearizeSPMD": 0.03150320053100586,
|
| 848 |
+
"DoNothing": 8.726119995117188e-05,
|
| 849 |
+
"DramToDramTranspose": 0.028717756271362305,
|
| 850 |
+
"DumpGraphAndMetadata": 0.04632568359375,
|
| 851 |
+
"EliminateDivs": 0.0021729469299316406,
|
| 852 |
+
"ExpandBatchNorm": 0.0017549991607666016,
|
| 853 |
+
"ExpandISAMacro": 0.0053784847259521484,
|
| 854 |
+
"FactorizeBlkDims": 0.046364784240722656,
|
| 855 |
+
"FactorizeThreadAxesInFreeDims": 0.0036237239837646484,
|
| 856 |
+
"FlattenMacroLoop": 0.012475728988647461,
|
| 857 |
+
"GenericAccessSimplifier": 0.0007128715515136719,
|
| 858 |
+
"InferInitValue": 0.11746096611022949,
|
| 859 |
+
"InferIntrinsicOnCC": 0.008626222610473633,
|
| 860 |
+
"InferNeuronTensor": 0.17520785331726074,
|
| 861 |
+
"InferNonlocalTensors": 0.02865004539489746,
|
| 862 |
+
"InferPSumTensor": 0.07464981079101563,
|
| 863 |
+
"InferShardAxis": 0.2832298278808594,
|
| 864 |
+
"InferSharedMemLoc": 0.01778268814086914,
|
| 865 |
+
"InlineNativeKernels": 0.0025413036346435547,
|
| 866 |
+
"InsertCoreBarrier": 0.007167816162109375,
|
| 867 |
+
"InsertIOTransposes": 0.058136701583862305,
|
| 868 |
+
"InsertImplicitShardAxisBeforeISel": 0.024377822875976563,
|
| 869 |
+
"InsertLocalTransposes": 0.016265153884887695,
|
| 870 |
+
"InsertOffloadedTransposes": 0.03376030921936035,
|
| 871 |
+
"LICM": 0.015621185302734375,
|
| 872 |
+
"LateLegalizeInst": 0.018033266067504883,
|
| 873 |
+
"LateLegalizePostSplit": 0.01734447479248047,
|
| 874 |
+
"LateLowerReshapeOp": 0.0016047954559326172,
|
| 875 |
+
"LateLowerTensorOp": 0.0011878013610839844,
|
| 876 |
+
"LateNeuronInstComb": 0.05313730239868164,
|
| 877 |
+
"LayoutPreprocessing": 0.05620622634887695,
|
| 878 |
+
"LayoutPreprocessingAndAnalysis": 0.18100428581237793,
|
| 879 |
+
"LayoutRequirementAnalysis": 0.014584064483642578,
|
| 880 |
+
"LegalizeCCOpLayout": 0.0032541751861572266,
|
| 881 |
+
"LegalizeOpLevelAlias": 0.0010030269622802734,
|
| 882 |
+
"LegalizePartitionReduce": 0.002452373504638672,
|
| 883 |
+
"LegalizeSundaAccess": 0.040776968002319336,
|
| 884 |
+
"LegalizeSundaMacro": 0.0427708625793457,
|
| 885 |
+
"LegalizeType": 0.016519784927368164,
|
| 886 |
+
"LocalLayoutOpt": 0.014898538589477539,
|
| 887 |
+
"LoopFusion": 0.005176067352294922,
|
| 888 |
+
"LoopSplitting": 0.00048732757568359375,
|
| 889 |
+
"LowerBroadcast": 0.004655599594116211,
|
| 890 |
+
"LowerCCOpBlockAxis": 0.004888296127319336,
|
| 891 |
+
"LowerComplexBroadcast": 0.010831594467163086,
|
| 892 |
+
"LowerIntrinsics": 0.03900289535522461,
|
| 893 |
+
"LowerShardAxis": 0.017355918884277344,
|
| 894 |
+
"LowerTensorOp": 0.013428449630737305,
|
| 895 |
+
"LowerToSendRecv": 0.038613319396972656,
|
| 896 |
+
"LowerTranspose": 0.050206661224365234,
|
| 897 |
+
"MacroGeneration": 0.1058506965637207,
|
| 898 |
+
"MaskPropagation": 0.004538536071777344,
|
| 899 |
+
"MemcpyElimination": 0.04629826545715332,
|
| 900 |
+
"MutateDataType": 0.0012559890747070313,
|
| 901 |
+
"NeuronAliasDependencyInduction": 0.0006165504455566406,
|
| 902 |
+
"NeuronAliasDependencyReset": 0.03877615928649902,
|
| 903 |
+
"NeuronInstComb": 0.02690267562866211,
|
| 904 |
+
"NeuronLICM": 0.024822473526000977,
|
| 905 |
+
"NeuronLoopFusion": 0.08438324928283691,
|
| 906 |
+
"NeuronLoopInterchange": 0.0028100013732910156,
|
| 907 |
+
"NeuronSimplifier": 0.0370326042175293,
|
| 908 |
+
"NeuronSimplifyPredicates": 0.017668962478637695,
|
| 909 |
+
"NeuronValueNumbering": 0.006052970886230469,
|
| 910 |
+
"OptimizeAliasedCopyChain": 0.0005040168762207031,
|
| 911 |
+
"OptimizeNKIKernels": 4.637849807739258,
|
| 912 |
+
"PAGLayoutOpt": 0.15427088737487793,
|
| 913 |
+
"PComputeCutting": 0.022019147872924805,
|
| 914 |
+
"PGLayoutTilingPipeline": 1.5585658550262451,
|
| 915 |
+
"PGTiling": 0.3059046268463135,
|
| 916 |
+
"PadElimination": 0.00058746337890625,
|
| 917 |
+
"ParAxesAnnotation": 0.07737350463867188,
|
| 918 |
+
"PartialLoopFusion": 0.03046131134033203,
|
| 919 |
+
"PartialSimdFusion": 0.008630514144897461,
|
| 920 |
+
"PerfectLoopNest": 0.0037374496459960938,
|
| 921 |
+
"RecognizeOpIdiom": 0.0049936771392822266,
|
| 922 |
+
"Recompute": 0.0004494190216064453,
|
| 923 |
+
"RelaxPredicates": 0.00769495964050293,
|
| 924 |
+
"Rematerialization": 0.0034401416778564453,
|
| 925 |
+
"RemoveShardedPartitionAxes": 0.008293628692626953,
|
| 926 |
+
"ReshapeWeights": 0.004475116729736328,
|
| 927 |
+
"ResolveAccessConflict": 0.0053598880767822266,
|
| 928 |
+
"ResolveComplicatePredicates": 0.0009164810180664063,
|
| 929 |
+
"RewriteReplicationMatmul": 0.00577545166015625,
|
| 930 |
+
"RewriteWeights": 0.010277271270751953,
|
| 931 |
+
"SFKVectorizer": 0.19967889785766602,
|
| 932 |
+
"ShardingPropagationAnalysis": 0.06793785095214844,
|
| 933 |
+
"SimpleAllReduceTiling": 0.004133701324462891,
|
| 934 |
+
"Simplifier": 0.0029976367950439453,
|
| 935 |
+
"SimplifyMacroPredicates": 0.025454998016357422,
|
| 936 |
+
"SimplifyNeuronTensor": 0.029609203338623047,
|
| 937 |
+
"SimplifySlice": 0.0008246898651123047,
|
| 938 |
+
"SimplifyTensor": 0.03260469436645508,
|
| 939 |
+
"SpillPSum": 0.01929450035095215,
|
| 940 |
+
"SplitAPUnionSets": 0.08632850646972656,
|
| 941 |
+
"SplitAccGrp": 0.002518892288208008,
|
| 942 |
+
"StaticProfiler": 0.026699542999267578,
|
| 943 |
+
"StaticTransposeLocalTensor": 0.009710550308227539,
|
| 944 |
+
"SundaISel": 0.08615612983703613,
|
| 945 |
+
"TCTransform": 0.0014863014221191406,
|
| 946 |
+
"TensorInitialization": 0.017354965209960938,
|
| 947 |
+
"TensorOpSimplifier": 0.004897356033325195,
|
| 948 |
+
"TensorOpTransform": 0.026237010955810547,
|
| 949 |
+
"TileCCOps": 0.007733821868896484,
|
| 950 |
+
"TilingProfiler": 0.03455352783203125,
|
| 951 |
+
"TransformConvOp": 0.0042724609375,
|
| 952 |
+
"TritiumFusion": 0.11825895309448242,
|
| 953 |
+
"ValueNumbering": 0.0019876956939697266,
|
| 954 |
+
"VectorizeDMA": 0.03213214874267578,
|
| 955 |
+
"VectorizeMatMult": 0.010382413864135742,
|
| 956 |
+
"WeightCoalescing": 0.003669261932373047,
|
| 957 |
+
"ZeroSizeTensorElimination": 0.00017881393432617188
|
| 958 |
+
},
|
| 959 |
+
"tensorizer": {
|
| 960 |
+
"DMATilingProfiler::TotalInstructionsAfterTiling": 20919.0,
|
| 961 |
+
"StaticProfiler::AifUb": 147.03309631347656,
|
| 962 |
+
"StaticProfiler::ArithmeticIntensityTensorizer": 141.05162048339844,
|
| 963 |
+
"StaticProfiler::AverageDmaLength": 2425.82958984375,
|
| 964 |
+
"StaticProfiler::AverageFractalPeUtilization": 98.71436309814453,
|
| 965 |
+
"StaticProfiler::AveragePartitionUtilization": 94.08551025390625,
|
| 966 |
+
"StaticProfiler::AveragePeUtilization": 96.60899353027344,
|
| 967 |
+
"StaticProfiler::DDRTransferBytes": 365941792.0,
|
| 968 |
+
"StaticProfiler::InternalTransferBytes": 325506848.0,
|
| 969 |
+
"StaticProfiler::LoadExpanded": 84060.0,
|
| 970 |
+
"StaticProfiler::LocalizationEfficiency": 95.931884765625,
|
| 971 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 99.52960968017578,
|
| 972 |
+
"StaticProfiler::StoreExpanded": 1898.0,
|
| 973 |
+
"StaticProfiler::TotalDMAExpanded": 85958.0,
|
| 974 |
+
"StaticProfiler::TotalDynamicInstancesCount": 25383.0,
|
| 975 |
+
"StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 24932.0,
|
| 976 |
+
"StaticProfiler::TotalLNCComm": 0.0,
|
| 977 |
+
"StaticProfiler::TotalLNCCommTransfer": 0.0,
|
| 978 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
|
| 979 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
|
| 980 |
+
"TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
|
| 981 |
+
"TilingProfiler::DmaInstructionsAfterTiling": 0.0,
|
| 982 |
+
"TilingProfiler::GenericInstructionsAfterTiling": 4.0,
|
| 983 |
+
"TilingProfiler::MatMultInstructionsAfterTiling": 10464.0,
|
| 984 |
+
"TilingProfiler::NumPfTransposes": 6.0,
|
| 985 |
+
"TilingProfiler::NumPfTransposesForIo": 1.0,
|
| 986 |
+
"TilingProfiler::NumPfTransposesForLocal": 1.0,
|
| 987 |
+
"TilingProfiler::NumPfTransposesForNonlocal": 4.0,
|
| 988 |
+
"TilingProfiler::PfTransposeInstructions": 10195.0,
|
| 989 |
+
"TilingProfiler::PfTransposeInstructionsForIo": 9504.0,
|
| 990 |
+
"TilingProfiler::PfTransposeInstructionsForLocal": 1.0,
|
| 991 |
+
"TilingProfiler::PfTransposeInstructionsForNonlocal": 690.0,
|
| 992 |
+
"TilingProfiler::ReduceInstructionsAfterTiling": 4.0,
|
| 993 |
+
"TilingProfiler::SimdInstructionsAfterTiling": 92.0,
|
| 994 |
+
"TilingProfiler::TotalInstructionsAfterTiling": 0.0,
|
| 995 |
+
"TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
|
| 996 |
+
"TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
|
| 997 |
+
"TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
|
| 998 |
+
"TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
|
| 999 |
+
"TransformConvOp::conv2d_column_packing": 0.0,
|
| 1000 |
+
"TransformConvOp::conv2d_column_packing_1": 0.0,
|
| 1001 |
+
"TransformConvOp::conv2d_column_packing_io10": 0.0,
|
| 1002 |
+
"TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
|
| 1003 |
+
}
|
| 1004 |
+
},
|
| 1005 |
+
"sg01": {
|
| 1006 |
+
"compiletime": {
|
| 1007 |
+
"CanonicalizeConv": 2.499999936844688e-05,
|
| 1008 |
+
"CanonicalizeForTensorizer": 1.1000000085914508e-05,
|
| 1009 |
+
"Canonicalizer": 0.00020599999697878957,
|
| 1010 |
+
"HoistCompute": 1.9999999949504854e-06,
|
| 1011 |
+
"IdentifyCrossPassTensors": 1.2000000424450263e-05,
|
| 1012 |
+
"MemcastMotion": 7.999999979801942e-06,
|
| 1013 |
+
"PenguinizeFunctions": 9.999999747378752e-06,
|
| 1014 |
+
"PruneFunctions": 1.4999999621068127e-05,
|
| 1015 |
+
"RemoveOptimizationBarriers": 1.9999999494757503e-05,
|
| 1016 |
+
"ScatterMotion": 1.9999999949504854e-06,
|
| 1017 |
+
"TensorizerLegalizationPass": 1.4000000192027073e-05,
|
| 1018 |
+
"VerifySupportedOps": 9.999999747378752e-06,
|
| 1019 |
+
"algsimp": 4.999999873689376e-05,
|
| 1020 |
+
"batchnorm_expander": 1.1000000085914508e-05,
|
| 1021 |
+
"boundary-marker-removal": 3.999999989900971e-06,
|
| 1022 |
+
"call-inliner": 7.999999979801942e-06,
|
| 1023 |
+
"canonicalize-boundary-marker": 4.999999873689376e-06,
|
| 1024 |
+
"collective-stream-id-checker": 3.000000106112566e-06,
|
| 1025 |
+
"comparison-expander": 3.999999989900971e-06,
|
| 1026 |
+
"computation-deduplicator": 1.8000000636675395e-05,
|
| 1027 |
+
"config-lowering": 3.400000059627928e-05,
|
| 1028 |
+
"constant_folding": 9.000000318337698e-06,
|
| 1029 |
+
"cse": 9.999999747378752e-06,
|
| 1030 |
+
"dce": 9.999999974752427e-07,
|
| 1031 |
+
"dynamic-slice-transpose": 4.999999873689376e-06,
|
| 1032 |
+
"eliminate-redundant-compare": 1.2999999853491317e-05,
|
| 1033 |
+
"emit-offloaded-dropout": 1.2999999853491317e-05,
|
| 1034 |
+
"flatten-call-graph": 7.000000096013537e-06,
|
| 1035 |
+
"fuse-send-recv": 1.8000000636675395e-05,
|
| 1036 |
+
"hilo-conditional-to-select": 4.999999873689376e-06,
|
| 1037 |
+
"hilo::LegalizeAlias": 3.999999989900971e-06,
|
| 1038 |
+
"hilo::NeuronInstCombine": 5.6000000768108293e-05,
|
| 1039 |
+
"hilo::NeuronOpFusion": 2.300000051036477e-05,
|
| 1040 |
+
"hilo::ReplaceTokenTypeWithU8Pass": 1.2000000424450263e-05,
|
| 1041 |
+
"hilo::ScheduleFusion": 9.999999974752427e-07,
|
| 1042 |
+
"hilo::SixtyFourHack": 7.999999979801942e-06,
|
| 1043 |
+
"hilo::VerifyAliasing": 1.9999999949504854e-06,
|
| 1044 |
+
"hlo-mac-count": 8.900000102585182e-05,
|
| 1045 |
+
"legalize-ccops-for-tensorizer": 9.999999974752427e-07,
|
| 1046 |
+
"legalize-compare": 3.999999989900971e-06,
|
| 1047 |
+
"lower-argminmax-custom-call": 3.999999989900971e-06,
|
| 1048 |
+
"map-inline": 9.999999747378752e-06,
|
| 1049 |
+
"metadata-naming": 1.700000029813964e-05,
|
| 1050 |
+
"mlir::detail::OpToOpPassAdaptor": 1.8999999156221747e-05,
|
| 1051 |
+
"mlir::hlo::MhloToPyPenguin": 0.0009159999899566174,
|
| 1052 |
+
"mlir::mhlo::LowerComplexExtraPass": 6.900000153109431e-05,
|
| 1053 |
+
"mlir::mhlo::LowerComplexPass": 0.00011800000356743112,
|
| 1054 |
+
"native-to-custom-softmax": 4.999999873689376e-06,
|
| 1055 |
+
"native-to-custom-softmax-dx": 1.2999999853491317e-05,
|
| 1056 |
+
"neuron-hlo-verifier": 0.00035600000410340726,
|
| 1057 |
+
"operand_upcaster": 1.2000000424450263e-05,
|
| 1058 |
+
"post-par-pipe-begin": 9.999999974752427e-07,
|
| 1059 |
+
"post-par-pipe-end": 0.0,
|
| 1060 |
+
"post-partition-simplification": 0.0004619999963324517,
|
| 1061 |
+
"replace-minimum-constant": 6.000000212225132e-06,
|
| 1062 |
+
"reshape-mover": 3.000000106112566e-06,
|
| 1063 |
+
"simplify-concat": 4.199999966658652e-05,
|
| 1064 |
+
"simplify-while-loops": 1.9999999949504854e-06,
|
| 1065 |
+
"transform-variadic-reduce": 7.000000096013537e-06,
|
| 1066 |
+
"tuple-simplifier": 4.999999873689376e-06,
|
| 1067 |
+
"unpack-nested-aws-ntwsr": 3.000000106112566e-06,
|
| 1068 |
+
"unroll-while-loop": 0.0
|
| 1069 |
+
},
|
| 1070 |
+
"hilo": {
|
| 1071 |
+
"ArithmeticIntensity": 105.0946273803711,
|
| 1072 |
+
"HloMacCount": 6509559808.0,
|
| 1073 |
+
"Traffic": 123879968.0
|
| 1074 |
+
}
|
| 1075 |
+
},
|
| 1076 |
+
"sg02": {
|
| 1077 |
+
"compiletime": {
|
| 1078 |
+
"CanonicalizeConv": 0.0,
|
| 1079 |
+
"CanonicalizeForTensorizer": 1.2999999853491317e-05,
|
| 1080 |
+
"Canonicalizer": 0.0002699999895412475,
|
| 1081 |
+
"HoistCompute": 4.999999873689376e-06,
|
| 1082 |
+
"IdentifyCrossPassTensors": 1.2000000424450263e-05,
|
| 1083 |
+
"MemcastMotion": 3.000000106112566e-06,
|
| 1084 |
+
"PenguinizeFunctions": 1.2000000424450263e-05,
|
| 1085 |
+
"PruneFunctions": 1.700000029813964e-05,
|
| 1086 |
+
"RemoveOptimizationBarriers": 7.000000096013537e-06,
|
| 1087 |
+
"ScatterMotion": 0.0,
|
| 1088 |
+
"TensorizerLegalizationPass": 7.999999979801942e-06,
|
| 1089 |
+
"VerifySupportedOps": 1.2000000424450263e-05,
|
| 1090 |
+
"algsimp": 4.8000001697801054e-05,
|
| 1091 |
+
"batchnorm_expander": 1.2999999853491317e-05,
|
| 1092 |
+
"boundary-marker-removal": 3.999999989900971e-06,
|
| 1093 |
+
"call-inliner": 9.000000318337698e-06,
|
| 1094 |
+
"canonicalize-boundary-marker": 4.999999873689376e-06,
|
| 1095 |
+
"collective-stream-id-checker": 3.000000106112566e-06,
|
| 1096 |
+
"comparison-expander": 2.4000000848900527e-05,
|
| 1097 |
+
"computation-deduplicator": 2.300000051036477e-05,
|
| 1098 |
+
"config-lowering": 3.400000059627928e-05,
|
| 1099 |
+
"constant_folding": 7.000000096013537e-06,
|
| 1100 |
+
"cse": 1.2999999853491317e-05,
|
| 1101 |
+
"dce": 9.999999974752427e-07,
|
| 1102 |
+
"dynamic-slice-transpose": 4.999999873689376e-06,
|
| 1103 |
+
"eliminate-redundant-compare": 3.000000106112566e-06,
|
| 1104 |
+
"emit-offloaded-dropout": 1.2000000424450263e-05,
|
| 1105 |
+
"flatten-call-graph": 1.1000000085914508e-05,
|
| 1106 |
+
"fuse-send-recv": 1.8999999156221747e-05,
|
| 1107 |
+
"hilo-conditional-to-select": 4.999999873689376e-06,
|
| 1108 |
+
"hilo::LegalizeAlias": 1.9999999949504854e-06,
|
| 1109 |
+
"hilo::NeuronInstCombine": 6.399999983841553e-05,
|
| 1110 |
+
"hilo::NeuronOpFusion": 9.999999747378752e-06,
|
| 1111 |
+
"hilo::ReplaceTokenTypeWithU8Pass": 1.8999999156221747e-05,
|
| 1112 |
+
"hilo::ScheduleFusion": 1.9999999949504854e-06,
|
| 1113 |
+
"hilo::SixtyFourHack": 4.400000034365803e-05,
|
| 1114 |
+
"hilo::VerifyAliasing": 9.999999974752427e-07,
|
| 1115 |
+
"hlo-mac-count": 0.004759000148624182,
|
| 1116 |
+
"legalize-ccops-for-tensorizer": 9.999999974752427e-07,
|
| 1117 |
+
"legalize-compare": 3.000000106112566e-06,
|
| 1118 |
+
"lower-argminmax-custom-call": 3.000000106112566e-06,
|
| 1119 |
+
"map-inline": 1.2999999853491317e-05,
|
| 1120 |
+
"metadata-naming": 1.5999999959603883e-05,
|
| 1121 |
+
"mlir::detail::OpToOpPassAdaptor": 2.4000000848900527e-05,
|
| 1122 |
+
"mlir::hlo::MhloToPyPenguin": 0.005001000128686428,
|
| 1123 |
+
"mlir::mhlo::LowerComplexExtraPass": 7.79999973019585e-05,
|
| 1124 |
+
"mlir::mhlo::LowerComplexPass": 1.1000000085914508e-05,
|
| 1125 |
+
"native-to-custom-softmax": 4.999999873689376e-06,
|
| 1126 |
+
"native-to-custom-softmax-dx": 1.2999999853491317e-05,
|
| 1127 |
+
"neuron-hlo-verifier": 0.0003600000054575503,
|
| 1128 |
+
"operand_upcaster": 1.2000000424450263e-05,
|
| 1129 |
+
"post-par-pipe-begin": 9.999999974752427e-07,
|
| 1130 |
+
"post-par-pipe-end": 0.0,
|
| 1131 |
+
"post-partition-simplification": 0.00044999999227002263,
|
| 1132 |
+
"replace-minimum-constant": 7.000000096013537e-06,
|
| 1133 |
+
"reshape-mover": 3.000000106112566e-06,
|
| 1134 |
+
"simplify-concat": 3.199999991920777e-05,
|
| 1135 |
+
"simplify-while-loops": 1.9999999949504854e-06,
|
| 1136 |
+
"transform-variadic-reduce": 4.8000001697801054e-05,
|
| 1137 |
+
"tuple-simplifier": 3.999999989900971e-06,
|
| 1138 |
+
"unpack-nested-aws-ntwsr": 4.999999873689376e-06,
|
| 1139 |
+
"unroll-while-loop": 0.0
|
| 1140 |
+
},
|
| 1141 |
+
"hilo": {
|
| 1142 |
+
"ArithmeticIntensity": 28.312292098999023,
|
| 1143 |
+
"HloMacCount": 4988469248.0,
|
| 1144 |
+
"Traffic": 352388928.0
|
| 1145 |
+
}
|
| 1146 |
+
},
|
| 1147 |
+
"topk": {
|
| 1148 |
+
"compiletime": {
|
| 1149 |
+
"CoalesceCCOp": 0.006628990173339844,
|
| 1150 |
+
"DMALocalityOpt": 0.003807544708251953,
|
| 1151 |
+
"DMAProfiler": 0.007816553115844727,
|
| 1152 |
+
"DataStreaming": 0.022742509841918945,
|
| 1153 |
+
"DoNothing": 0.00023865699768066406,
|
| 1154 |
+
"ExpandISAMacro": 0.0065212249755859375,
|
| 1155 |
+
"FactorizeBlkDims": 0.026747465133666992,
|
| 1156 |
+
"InferPSumTensor": 0.02189779281616211,
|
| 1157 |
+
"InferSharedMemLoc": 0.0063364505767822266,
|
| 1158 |
+
"InsertCoreBarrier": 0.006017923355102539,
|
| 1159 |
+
"LateLegalizeInst": 0.014620304107666016,
|
| 1160 |
+
"LateNeuronInstComb": 0.0202789306640625,
|
| 1161 |
+
"LegalizeSundaAccess": 0.028186798095703125,
|
| 1162 |
+
"LegalizeType": 0.019533634185791016,
|
| 1163 |
+
"LowerBroadcast": 0.013374805450439453,
|
| 1164 |
+
"LowerIntrinsics": 0.010933876037597656,
|
| 1165 |
+
"LowerTranspose": 0.00744938850402832,
|
| 1166 |
+
"NeuronInstComb": 0.02654409408569336,
|
| 1167 |
+
"NeuronLICM": 0.021889686584472656,
|
| 1168 |
+
"NeuronSimplifyPredicates": 0.006708860397338867,
|
| 1169 |
+
"NeuronValueNumbering": 0.007520914077758789,
|
| 1170 |
+
"SFKVectorizer": 0.06228280067443848,
|
| 1171 |
+
"SimpleAllReduceTiling": 0.00653386116027832,
|
| 1172 |
+
"SimplifyNeuronTensor": 0.09801602363586426,
|
| 1173 |
+
"SpillPSum": 0.04999828338623047,
|
| 1174 |
+
"WeightCoalescing": 0.0065784454345703125
|
| 1175 |
+
}
|
| 1176 |
+
}
|
| 1177 |
+
}
|
context_encoding_model/_tp0_bk1/graph.neff
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b96dea22dba97fdfefb2f26f7ad03c509af0a395c08e4bfb143ff14bd673c826
|
| 3 |
+
size 1229824
|
context_encoding_model/_tp0_bk1/log-neuron-cc.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
context_encoding_model/_tp0_bk1/metaneff.pb
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:456dc08330072407208f8e4a41b70cc9190b30d05dced01f768e2bbc43e5076d
|
| 3 |
+
size 2438380
|
context_encoding_model/_tp0_bk1/model.MODULE_2330bfb0632c950ddab1+62ecd68b.hlo_module.pb
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c4a7565239b86e91fc95d8ad2ceb0bdd0fa2489c90c536cf87cd40f007ac5d60
|
| 3 |
+
size 2525166
|
context_encoding_model/_tp0_bk1/model.MODULE_2330bfb0632c950ddab1+62ecd68b.neff
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b96dea22dba97fdfefb2f26f7ad03c509af0a395c08e4bfb143ff14bd673c826
|
| 3 |
+
size 1229824
|
context_encoding_model/_tp0_bk1/neuron_config.json
ADDED
|
@@ -0,0 +1,224 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_attn_implementation_autoset": false,
|
| 3 |
+
"_name_or_path": "/home/ubuntu/models/Qwen3-1.7B",
|
| 4 |
+
"add_cross_attention": false,
|
| 5 |
+
"architectures": [
|
| 6 |
+
"Qwen3ForCausalLM"
|
| 7 |
+
],
|
| 8 |
+
"attention_bias": false,
|
| 9 |
+
"attention_dropout": 0.0,
|
| 10 |
+
"attribute_map": {},
|
| 11 |
+
"bad_words_ids": null,
|
| 12 |
+
"begin_suppress_tokens": null,
|
| 13 |
+
"bos_token_id": 151643,
|
| 14 |
+
"chunk_size_feed_forward": 0,
|
| 15 |
+
"cross_attention_hidden_size": null,
|
| 16 |
+
"decoder_start_token_id": null,
|
| 17 |
+
"diversity_penalty": 0.0,
|
| 18 |
+
"do_sample": false,
|
| 19 |
+
"early_stopping": false,
|
| 20 |
+
"encoder_no_repeat_ngram_size": 0,
|
| 21 |
+
"eos_token_id": 151645,
|
| 22 |
+
"exponential_decay_length_penalty": null,
|
| 23 |
+
"finetuning_task": null,
|
| 24 |
+
"forced_bos_token_id": null,
|
| 25 |
+
"forced_eos_token_id": null,
|
| 26 |
+
"fused_spec_config": null,
|
| 27 |
+
"head_dim": 128,
|
| 28 |
+
"hidden_act": "silu",
|
| 29 |
+
"hidden_size": 2048,
|
| 30 |
+
"id2label": {
|
| 31 |
+
"0": "LABEL_0",
|
| 32 |
+
"1": "LABEL_1"
|
| 33 |
+
},
|
| 34 |
+
"initializer_range": 0.02,
|
| 35 |
+
"intermediate_size": 6144,
|
| 36 |
+
"is_decoder": false,
|
| 37 |
+
"is_encoder_decoder": false,
|
| 38 |
+
"label2id": {
|
| 39 |
+
"LABEL_0": 0,
|
| 40 |
+
"LABEL_1": 1
|
| 41 |
+
},
|
| 42 |
+
"length_penalty": 1.0,
|
| 43 |
+
"max_length": 20,
|
| 44 |
+
"max_position_embeddings": 40960,
|
| 45 |
+
"max_window_layers": 28,
|
| 46 |
+
"metadata": null,
|
| 47 |
+
"min_length": 0,
|
| 48 |
+
"model_type": "qwen3",
|
| 49 |
+
"neuron_config": {
|
| 50 |
+
"activation_quantization_type": null,
|
| 51 |
+
"allow_input_truncation": false,
|
| 52 |
+
"apply_seq_ids_mask": false,
|
| 53 |
+
"async_mode": false,
|
| 54 |
+
"attention_dp_degree": 1,
|
| 55 |
+
"attention_dtype": null,
|
| 56 |
+
"attn_block_cte_nki_kernel_enabled": false,
|
| 57 |
+
"attn_block_tkg_nki_kernel_cache_update": false,
|
| 58 |
+
"attn_block_tkg_nki_kernel_cascaded_attention": false,
|
| 59 |
+
"attn_block_tkg_nki_kernel_enabled": false,
|
| 60 |
+
"attn_cls": {
|
| 61 |
+
"__module__": "neuronx_distributed_inference.models.qwen3.modeling_qwen3",
|
| 62 |
+
"__name__": "NeuronQwen3Attention"
|
| 63 |
+
},
|
| 64 |
+
"attn_kernel_enabled": null,
|
| 65 |
+
"attn_tkg_builtin_kernel_enabled": false,
|
| 66 |
+
"attn_tkg_nki_kernel_enabled": false,
|
| 67 |
+
"batch_size": 1,
|
| 68 |
+
"bucket_n_active_tokens": true,
|
| 69 |
+
"buckets": [
|
| 70 |
+
256
|
| 71 |
+
],
|
| 72 |
+
"cast_type": "config",
|
| 73 |
+
"cc_pipeline_tiling_factor": 2,
|
| 74 |
+
"chunked_prefill_config": null,
|
| 75 |
+
"context_encoding_buckets": [
|
| 76 |
+
256
|
| 77 |
+
],
|
| 78 |
+
"cp_degree": 1,
|
| 79 |
+
"ctx_batch_size": 1,
|
| 80 |
+
"disable_kv_cache_tiling": false,
|
| 81 |
+
"draft_model_modules_to_not_convert": null,
|
| 82 |
+
"enable_bucketing": true,
|
| 83 |
+
"enable_cte_modular_flow": false,
|
| 84 |
+
"enable_eagle_draft_input_norm": false,
|
| 85 |
+
"enable_eagle_speculation": false,
|
| 86 |
+
"enable_fused_speculation": false,
|
| 87 |
+
"enable_long_context_mode": false,
|
| 88 |
+
"enable_output_completion_notifications": false,
|
| 89 |
+
"enable_spill_reload_dge": false,
|
| 90 |
+
"enable_token_tree": false,
|
| 91 |
+
"ep_degree": 1,
|
| 92 |
+
"expert_mlp_nki_kernel_enabled": null,
|
| 93 |
+
"flash_decoding_enabled": false,
|
| 94 |
+
"fused_qkv": false,
|
| 95 |
+
"fused_rmsnorm_skip_gamma": false,
|
| 96 |
+
"is_block_kv_layout": null,
|
| 97 |
+
"is_chunked_prefill": false,
|
| 98 |
+
"is_continuous_batching": true,
|
| 99 |
+
"is_eagle_draft": false,
|
| 100 |
+
"is_medusa": false,
|
| 101 |
+
"is_prefill_stage": true,
|
| 102 |
+
"is_prefix_caching": false,
|
| 103 |
+
"k_cache_transposed": false,
|
| 104 |
+
"kv_cache_batch_size": 8,
|
| 105 |
+
"kv_cache_padding_size": 0,
|
| 106 |
+
"kv_cache_quant": false,
|
| 107 |
+
"kv_cache_tiling": false,
|
| 108 |
+
"layer_boundary_markers": false,
|
| 109 |
+
"lm_head_pad": true,
|
| 110 |
+
"lm_head_pad_alignment_size": 1,
|
| 111 |
+
"local_ranks_size": 2,
|
| 112 |
+
"logical_nc_config": 2,
|
| 113 |
+
"lora_config": null,
|
| 114 |
+
"max_batch_size": 8,
|
| 115 |
+
"max_context_length": 4096,
|
| 116 |
+
"max_length": 4096,
|
| 117 |
+
"max_new_tokens": null,
|
| 118 |
+
"medusa_speculation_length": 0,
|
| 119 |
+
"medusa_tree": null,
|
| 120 |
+
"mlp_kernel_enabled": false,
|
| 121 |
+
"mlp_kernel_fuse_residual_add": false,
|
| 122 |
+
"modules_to_not_convert": null,
|
| 123 |
+
"moe_fused_nki_kernel_enabled": null,
|
| 124 |
+
"n_active_tokens": 4096,
|
| 125 |
+
"n_positions": 4096,
|
| 126 |
+
"num_medusa_heads": 0,
|
| 127 |
+
"on_cpu": false,
|
| 128 |
+
"on_device_sampling_config": {
|
| 129 |
+
"deterministic": false,
|
| 130 |
+
"do_sample": false,
|
| 131 |
+
"dynamic": true,
|
| 132 |
+
"global_topk": 256,
|
| 133 |
+
"on_device_sampling_config": true,
|
| 134 |
+
"temperature": 1.0,
|
| 135 |
+
"top_k": 1,
|
| 136 |
+
"top_k_kernel_enabled": false,
|
| 137 |
+
"top_p": 1.0
|
| 138 |
+
},
|
| 139 |
+
"output_logits": false,
|
| 140 |
+
"overrides_torch_dtype": true,
|
| 141 |
+
"pa_block_size": 4096,
|
| 142 |
+
"pa_num_blocks": 8,
|
| 143 |
+
"padding_side": "right",
|
| 144 |
+
"pp_degree": 1,
|
| 145 |
+
"prefix_buckets": null,
|
| 146 |
+
"qk_layernorm": false,
|
| 147 |
+
"qkv_kernel_enabled": false,
|
| 148 |
+
"qkv_kernel_fuse_residual_add": false,
|
| 149 |
+
"qkv_kernel_nbsd_layout": false,
|
| 150 |
+
"quantization_dtype": "int8",
|
| 151 |
+
"quantization_type": "per_tensor_symmetric",
|
| 152 |
+
"quantize_clamp_bound": Infinity,
|
| 153 |
+
"quantized": false,
|
| 154 |
+
"quantized_checkpoints_path": null,
|
| 155 |
+
"quantized_mlp_kernel_enabled": false,
|
| 156 |
+
"rmsnorm_quantize_kernel_enabled": false,
|
| 157 |
+
"router_topk_nki_kernel_enabled": null,
|
| 158 |
+
"rpl_reduce_dtype": null,
|
| 159 |
+
"save_sharded_checkpoint": true,
|
| 160 |
+
"scratchpad_page_size": null,
|
| 161 |
+
"seq_len": 4096,
|
| 162 |
+
"seq_len_threshold_for_cc_tiling": 16384,
|
| 163 |
+
"sequence_parallel_enabled": false,
|
| 164 |
+
"shared_mlp_nki_kernel_enabled": null,
|
| 165 |
+
"skip_sharding": false,
|
| 166 |
+
"skip_warmup": false,
|
| 167 |
+
"spec_batch_size": 8,
|
| 168 |
+
"speculation_length": 0,
|
| 169 |
+
"start_rank_id": 0,
|
| 170 |
+
"strided_context_parallel_kernel_enabled": false,
|
| 171 |
+
"target": null,
|
| 172 |
+
"tensor_capture_config": null,
|
| 173 |
+
"tile_cc": false,
|
| 174 |
+
"tkg_batch_size": 8,
|
| 175 |
+
"token_generation_buckets": null,
|
| 176 |
+
"token_tree_config": null,
|
| 177 |
+
"torch_dtype": "bfloat16",
|
| 178 |
+
"tp_degree": 2,
|
| 179 |
+
"vocab_parallel": false,
|
| 180 |
+
"weight_gather_seq_len_threshold": 32768,
|
| 181 |
+
"weights_to_skip_layout_optimization": [],
|
| 182 |
+
"world_size": 2
|
| 183 |
+
},
|
| 184 |
+
"no_repeat_ngram_size": 0,
|
| 185 |
+
"num_attention_heads": 16,
|
| 186 |
+
"num_beam_groups": 1,
|
| 187 |
+
"num_beams": 1,
|
| 188 |
+
"num_cores_per_group": 1,
|
| 189 |
+
"num_hidden_layers": 28,
|
| 190 |
+
"num_key_value_heads": 8,
|
| 191 |
+
"num_return_sequences": 1,
|
| 192 |
+
"output_attentions": false,
|
| 193 |
+
"output_hidden_states": false,
|
| 194 |
+
"output_scores": false,
|
| 195 |
+
"pad_token_id": 0,
|
| 196 |
+
"prefix": null,
|
| 197 |
+
"problem_type": null,
|
| 198 |
+
"pruned_heads": {},
|
| 199 |
+
"remove_invalid_values": false,
|
| 200 |
+
"repetition_penalty": 1.0,
|
| 201 |
+
"return_dict": true,
|
| 202 |
+
"return_dict_in_generate": false,
|
| 203 |
+
"rms_norm_eps": 1e-06,
|
| 204 |
+
"rope_scaling": null,
|
| 205 |
+
"rope_theta": 1000000,
|
| 206 |
+
"sep_token_id": null,
|
| 207 |
+
"sliding_window": null,
|
| 208 |
+
"suppress_tokens": null,
|
| 209 |
+
"task_specific_params": null,
|
| 210 |
+
"temperature": 1.0,
|
| 211 |
+
"tf_legacy_loss": false,
|
| 212 |
+
"tie_encoder_decoder": false,
|
| 213 |
+
"tie_word_embeddings": true,
|
| 214 |
+
"tokenizer_class": null,
|
| 215 |
+
"top_k": 50,
|
| 216 |
+
"top_p": 1.0,
|
| 217 |
+
"torchscript": false,
|
| 218 |
+
"transformers_version": "4.51.0",
|
| 219 |
+
"typical_p": 1.0,
|
| 220 |
+
"use_bfloat16": false,
|
| 221 |
+
"use_cache": true,
|
| 222 |
+
"use_sliding_window": false,
|
| 223 |
+
"vocab_size": 151936
|
| 224 |
+
}
|
context_encoding_model/_tp0_bk2/command.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
neuronx-cc compile --framework=XLA model.MODULE_49bb42f69f5b159ae769+3467f95e.hlo_module.pb --output model.MODULE_49bb42f69f5b159ae769+3467f95e.neff --target=trn2 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ' --lnc=2 -O1 '--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true' --logfile=log-neuron-cc.txt --verbose=35
|
context_encoding_model/_tp0_bk2/compile_flags.MODULE_49bb42f69f5b159ae769+3467f95e.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
["--target=trn2", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "--lnc=2", "-O1", "--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true", "--logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk2/log-neuron-cc.txt"]
|
context_encoding_model/_tp0_bk2/global_metric_store.json
ADDED
|
@@ -0,0 +1,1177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"Average": {
|
| 3 |
+
"tensorizer": {
|
| 4 |
+
"StaticProfiler::AverageFractalPeUtilization": 98.77135467529297,
|
| 5 |
+
"StaticProfiler::AveragePartitionUtilization": 94.32398223876953,
|
| 6 |
+
"StaticProfiler::AveragePeUtilization": 96.75625610351563,
|
| 7 |
+
"StaticProfiler::LocalizationEfficiency": 86.58112335205078,
|
| 8 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 89.48306274414063,
|
| 9 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
|
| 10 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 0.0
|
| 11 |
+
}
|
| 12 |
+
},
|
| 13 |
+
"Count": {
|
| 14 |
+
"tensorizer": {
|
| 15 |
+
"StaticProfiler::AverageFractalPeUtilization": 1.0,
|
| 16 |
+
"StaticProfiler::AveragePartitionUtilization": 1.0,
|
| 17 |
+
"StaticProfiler::AveragePeUtilization": 1.0,
|
| 18 |
+
"StaticProfiler::LocalizationEfficiency": 1.0,
|
| 19 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1.0,
|
| 20 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 1.0,
|
| 21 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 1.0
|
| 22 |
+
}
|
| 23 |
+
},
|
| 24 |
+
"Sum": {
|
| 25 |
+
"compiletime": {
|
| 26 |
+
"AGOrderingAnalysisPass": 0.07081985473632813,
|
| 27 |
+
"AffinePredicateResolution": 0.001847982406616211,
|
| 28 |
+
"AliasDependencyElimination": 0.0017039775848388672,
|
| 29 |
+
"AliasDependencyInduction": 0.016176223754882813,
|
| 30 |
+
"AliasDependencyReset": 0.0533907413482666,
|
| 31 |
+
"BFComputeCutting": 0.002690553665161133,
|
| 32 |
+
"BirCodeGenLoop": 0.436786413192749,
|
| 33 |
+
"CCOpFusion": 0.05509161949157715,
|
| 34 |
+
"CanonicalizeConv": 2.099999983329326e-05,
|
| 35 |
+
"CanonicalizeDAGForPGTiling": 0.01196432113647461,
|
| 36 |
+
"CanonicalizeForTensorizer": 4.0000002627493814e-05,
|
| 37 |
+
"CanonicalizeIR": 0.002866029739379883,
|
| 38 |
+
"Canonicalizer": 0.000770999991800636,
|
| 39 |
+
"CoalesceCCOp": 0.02091670036315918,
|
| 40 |
+
"CommuteConcat": 0.0016961097717285156,
|
| 41 |
+
"DMALocalityOpt": 0.012746095657348633,
|
| 42 |
+
"DMAProfiler": 0.025209903717041016,
|
| 43 |
+
"DMATilingProfiler": 0.013326406478881836,
|
| 44 |
+
"DataLocalityOpt": 0.13399314880371094,
|
| 45 |
+
"DataStreaming": 0.02252793312072754,
|
| 46 |
+
"DeConcat": 0.003023386001586914,
|
| 47 |
+
"DeadCodeElimination": 0.006216287612915039,
|
| 48 |
+
"DeadStoreElimination": 0.01400136947631836,
|
| 49 |
+
"DelinearIndices": 0.014129638671875,
|
| 50 |
+
"Delinearization": 0.004580259323120117,
|
| 51 |
+
"DelinearizeSPMD": 0.02204442024230957,
|
| 52 |
+
"DoNothing": 0.0005753040313720703,
|
| 53 |
+
"DramToDramTranspose": 0.0199737548828125,
|
| 54 |
+
"DumpGraphAndMetadata": 0.037271738052368164,
|
| 55 |
+
"EliminateDivs": 0.0025110244750976563,
|
| 56 |
+
"ExpandBatchNorm": 0.002251148223876953,
|
| 57 |
+
"ExpandISAMacro": 0.012173652648925781,
|
| 58 |
+
"FactorizeBlkDims": 0.041153669357299805,
|
| 59 |
+
"FactorizeThreadAxesInFreeDims": 0.0031156539916992188,
|
| 60 |
+
"FlattenMacroLoop": 0.005499601364135742,
|
| 61 |
+
"GenericAccessSimplifier": 0.004717350006103516,
|
| 62 |
+
"HoistCompute": 6.999999641266186e-06,
|
| 63 |
+
"IdentifyCrossPassTensors": 4.70000013592653e-05,
|
| 64 |
+
"InferInitValue": 0.046659231185913086,
|
| 65 |
+
"InferIntrinsicOnCC": 0.039793968200683594,
|
| 66 |
+
"InferNeuronTensor": 0.03774452209472656,
|
| 67 |
+
"InferNonlocalTensors": 0.030941486358642578,
|
| 68 |
+
"InferPSumTensor": 0.12924981117248535,
|
| 69 |
+
"InferShardAxis": 0.504509449005127,
|
| 70 |
+
"InferSharedMemLoc": 0.03389143943786621,
|
| 71 |
+
"InlineNativeKernels": 0.00193023681640625,
|
| 72 |
+
"InsertCoreBarrier": 0.019978046417236328,
|
| 73 |
+
"InsertIOTransposes": 0.061508893966674805,
|
| 74 |
+
"InsertImplicitShardAxisBeforeISel": 0.01612401008605957,
|
| 75 |
+
"InsertLocalTransposes": 0.005467414855957031,
|
| 76 |
+
"InsertOffloadedTransposes": 0.025030136108398438,
|
| 77 |
+
"LICM": 0.010097026824951172,
|
| 78 |
+
"LateLegalizeInst": 0.033937692642211914,
|
| 79 |
+
"LateLegalizePostSplit": 0.020189762115478516,
|
| 80 |
+
"LateLowerReshapeOp": 0.0018696784973144531,
|
| 81 |
+
"LateLowerTensorOp": 0.0022716522216796875,
|
| 82 |
+
"LateNeuronInstComb": 0.060944557189941406,
|
| 83 |
+
"LayoutPreprocessing": 0.05716848373413086,
|
| 84 |
+
"LayoutPreprocessingAndAnalysis": 0.12559008598327637,
|
| 85 |
+
"LayoutRequirementAnalysis": 0.01263284683227539,
|
| 86 |
+
"LegalizeCCOpLayout": 0.003709077835083008,
|
| 87 |
+
"LegalizeOpLevelAlias": 0.0016541481018066406,
|
| 88 |
+
"LegalizePartitionReduce": 0.007805347442626953,
|
| 89 |
+
"LegalizeSundaAccess": 0.13506388664245605,
|
| 90 |
+
"LegalizeSundaMacro": 0.020558595657348633,
|
| 91 |
+
"LegalizeType": 0.04366302490234375,
|
| 92 |
+
"LocalLayoutOpt": 0.04371356964111328,
|
| 93 |
+
"LoopFusion": 0.03305792808532715,
|
| 94 |
+
"LoopSplitting": 0.0017974376678466797,
|
| 95 |
+
"LowerBroadcast": 0.015467643737792969,
|
| 96 |
+
"LowerCCOpBlockAxis": 0.013673782348632813,
|
| 97 |
+
"LowerComplexBroadcast": 0.005238771438598633,
|
| 98 |
+
"LowerIntrinsics": 0.059927940368652344,
|
| 99 |
+
"LowerShardAxis": 0.02148151397705078,
|
| 100 |
+
"LowerTensorOp": 0.011847496032714844,
|
| 101 |
+
"LowerToSendRecv": 0.03099536895751953,
|
| 102 |
+
"LowerTranspose": 0.026517152786254883,
|
| 103 |
+
"MacroGeneration": 0.11886835098266602,
|
| 104 |
+
"MaskPropagation": 0.01356053352355957,
|
| 105 |
+
"MemcastMotion": 1.799999881768599e-05,
|
| 106 |
+
"MemcpyElimination": 0.050164222717285156,
|
| 107 |
+
"MutateDataType": 0.0028362274169921875,
|
| 108 |
+
"NeuronAliasDependencyInduction": 0.0024106502532958984,
|
| 109 |
+
"NeuronAliasDependencyReset": 0.07959818840026855,
|
| 110 |
+
"NeuronInstComb": 0.05623912811279297,
|
| 111 |
+
"NeuronLICM": 0.06090664863586426,
|
| 112 |
+
"NeuronLoopFusion": 0.0700373649597168,
|
| 113 |
+
"NeuronLoopInterchange": 0.003496885299682617,
|
| 114 |
+
"NeuronSimplifier": 0.0175168514251709,
|
| 115 |
+
"NeuronSimplifyPredicates": 0.035622596740722656,
|
| 116 |
+
"NeuronValueNumbering": 0.02324056625366211,
|
| 117 |
+
"OptimizeAliasedCopyChain": 0.0008881092071533203,
|
| 118 |
+
"OptimizeNKIKernels": 4.497897148132324,
|
| 119 |
+
"PAGLayoutOpt": 0.11170005798339844,
|
| 120 |
+
"PComputeCutting": 0.02699899673461914,
|
| 121 |
+
"PGLayoutTilingPipeline": 1.7730352878570557,
|
| 122 |
+
"PGTiling": 0.4928562641143799,
|
| 123 |
+
"PadElimination": 0.0005004405975341797,
|
| 124 |
+
"ParAxesAnnotation": 0.08141517639160156,
|
| 125 |
+
"PartialLoopFusion": 0.05184769630432129,
|
| 126 |
+
"PartialSimdFusion": 0.019034385681152344,
|
| 127 |
+
"PenguinizeFunctions": 3.7000001611886546e-05,
|
| 128 |
+
"PerfectLoopNest": 0.005218982696533203,
|
| 129 |
+
"PruneFunctions": 3.7999998312443495e-05,
|
| 130 |
+
"RecognizeOpIdiom": 0.028120994567871094,
|
| 131 |
+
"Recompute": 0.0006320476531982422,
|
| 132 |
+
"RelaxPredicates": 0.012555122375488281,
|
| 133 |
+
"Rematerialization": 0.002846240997314453,
|
| 134 |
+
"RemoveOptimizationBarriers": 8.199999865610152e-05,
|
| 135 |
+
"RemoveShardedPartitionAxes": 0.028553009033203125,
|
| 136 |
+
"ReshapeWeights": 0.0013833045959472656,
|
| 137 |
+
"ResolveAccessConflict": 0.007452726364135742,
|
| 138 |
+
"ResolveComplicatePredicates": 0.002027273178100586,
|
| 139 |
+
"RewriteReplicationMatmul": 0.0019905567169189453,
|
| 140 |
+
"RewriteWeights": 0.005997419357299805,
|
| 141 |
+
"SFKVectorizer": 0.2772505283355713,
|
| 142 |
+
"ScatterMotion": 2.300000051036477e-05,
|
| 143 |
+
"ShardingPropagationAnalysis": 0.11750531196594238,
|
| 144 |
+
"SimpleAllReduceTiling": 0.02184891700744629,
|
| 145 |
+
"Simplifier": 0.01620769500732422,
|
| 146 |
+
"SimplifyMacroPredicates": 0.03200030326843262,
|
| 147 |
+
"SimplifyNeuronTensor": 0.09968447685241699,
|
| 148 |
+
"SimplifySlice": 0.002093076705932617,
|
| 149 |
+
"SimplifyTensor": 0.01188349723815918,
|
| 150 |
+
"SpillPSum": 0.06837248802185059,
|
| 151 |
+
"SplitAPUnionSets": 0.09830927848815918,
|
| 152 |
+
"SplitAccGrp": 0.003184795379638672,
|
| 153 |
+
"StaticProfiler": 0.024499177932739258,
|
| 154 |
+
"StaticTransposeLocalTensor": 0.013921499252319336,
|
| 155 |
+
"SundaISel": 0.12911200523376465,
|
| 156 |
+
"TCTransform": 0.01076197624206543,
|
| 157 |
+
"TensorInitialization": 0.015585660934448242,
|
| 158 |
+
"TensorOpSimplifier": 0.009182214736938477,
|
| 159 |
+
"TensorOpTransform": 0.02479076385498047,
|
| 160 |
+
"TensorizerLegalizationPass": 4.5000000682193786e-05,
|
| 161 |
+
"TileCCOps": 0.01529073715209961,
|
| 162 |
+
"TilingProfiler": 0.02448558807373047,
|
| 163 |
+
"TransformConvOp": 0.0032668113708496094,
|
| 164 |
+
"TritiumFusion": 0.07947993278503418,
|
| 165 |
+
"ValueNumbering": 0.008611917495727539,
|
| 166 |
+
"VectorizeDMA": 0.008882284164428711,
|
| 167 |
+
"VectorizeMatMult": 0.013601303100585938,
|
| 168 |
+
"VerifySupportedOps": 3.199999991920777e-05,
|
| 169 |
+
"WeightCoalescing": 0.014402627944946289,
|
| 170 |
+
"ZeroSizeTensorElimination": 0.00017452239990234375,
|
| 171 |
+
"algsimp": 0.001744000008329749,
|
| 172 |
+
"batchnorm_expander": 3.5000000934815034e-05,
|
| 173 |
+
"boundary-marker-removal": 1.1000000085914508e-05,
|
| 174 |
+
"call-inliner": 0.00022499999613501132,
|
| 175 |
+
"canonicalize-boundary-marker": 1.2999999853491317e-05,
|
| 176 |
+
"collective-stream-id-checker": 6.0999998822808266e-05,
|
| 177 |
+
"comparison-expander": 0.0004409999819472432,
|
| 178 |
+
"computation-deduplicator": 5.299999611452222e-05,
|
| 179 |
+
"config-lowering": 9.800000407267362e-05,
|
| 180 |
+
"constant-statistics": 0.0003980000037699938,
|
| 181 |
+
"constant_folding": 0.00015499998698942363,
|
| 182 |
+
"cse": 3.199999991920777e-05,
|
| 183 |
+
"dce": 4.099999932805076e-05,
|
| 184 |
+
"dot_decomposer": 0.0008870000019669533,
|
| 185 |
+
"dynamic-slice-transpose": 1.2000000424450263e-05,
|
| 186 |
+
"eliminate-redundant-compare": 0.0001379999885102734,
|
| 187 |
+
"emit-offloaded-dropout": 3.400000059627928e-05,
|
| 188 |
+
"flatten-call-graph": 0.0006670000148005784,
|
| 189 |
+
"fuse-send-recv": 5.299999611452222e-05,
|
| 190 |
+
"hilo-conditional-to-select": 1.2000000424450263e-05,
|
| 191 |
+
"hilo::LegalizeAlias": 1.1999999514955562e-05,
|
| 192 |
+
"hilo::NeuronInstCombine": 0.00015300000086426735,
|
| 193 |
+
"hilo::NeuronOpFusion": 2.9999999242136255e-05,
|
| 194 |
+
"hilo::ReplaceTokenTypeWithU8Pass": 3.099999958067201e-05,
|
| 195 |
+
"hilo::ScheduleFusion": 5.999999757477781e-06,
|
| 196 |
+
"hilo::SixtyFourHack": 6.500000017695129e-05,
|
| 197 |
+
"hilo::VerifyAliasing": 4.999999873689376e-06,
|
| 198 |
+
"hlo-mac-count": 0.01228100061416626,
|
| 199 |
+
"instruction-histogram": 0.0007319999858736992,
|
| 200 |
+
"io-con-pipe-begin": 4.999999873689376e-06,
|
| 201 |
+
"io-con-pipe-end": 9.999999974752427e-07,
|
| 202 |
+
"io-layout-normalization": 0.0008159999852068722,
|
| 203 |
+
"io-statistics": 3.899999865097925e-05,
|
| 204 |
+
"legalize-ccops-for-tensorizer": 3.000000106112566e-06,
|
| 205 |
+
"legalize-compare": 1.1000000085914508e-05,
|
| 206 |
+
"lower-argminmax-custom-call": 9.000000318337698e-06,
|
| 207 |
+
"map-inline": 0.0007249999907799065,
|
| 208 |
+
"metadata-naming": 4.400000034365803e-05,
|
| 209 |
+
"mlir::detail::OpToOpPassAdaptor": 6.800000119255856e-05,
|
| 210 |
+
"mlir::hlo::MhloToPyPenguin": 0.008609999902546406,
|
| 211 |
+
"mlir::mhlo::LowerComplexExtraPass": 0.000291000003926456,
|
| 212 |
+
"mlir::mhlo::LowerComplexPass": 0.0005230000242590904,
|
| 213 |
+
"native-to-custom-softmax": 0.0003209999995306134,
|
| 214 |
+
"native-to-custom-softmax-dx": 0.0004980000085197389,
|
| 215 |
+
"neuron-hlo-verifier": 0.010431000031530857,
|
| 216 |
+
"operand_upcaster": 4.400000034365803e-05,
|
| 217 |
+
"opt-barrier-removal": 0.0002589999930933118,
|
| 218 |
+
"post-par-pipe-begin": 5.999999757477781e-06,
|
| 219 |
+
"post-par-pipe-end": 0.0,
|
| 220 |
+
"post-partition-simplification": 0.0013230000622570515,
|
| 221 |
+
"pre-par-pipe-begin": 9.999999974752427e-07,
|
| 222 |
+
"pre-par-pipe-end": 0.0,
|
| 223 |
+
"pre-partition-simplification": 0.06850799918174744,
|
| 224 |
+
"replace-minimum-constant": 0.00036299999919719994,
|
| 225 |
+
"reshape-mover": 5.500000042957254e-05,
|
| 226 |
+
"simplify-concat": 0.00010000000474974513,
|
| 227 |
+
"simplify-while-loops": 5.0000002374872565e-05,
|
| 228 |
+
"transform-variadic-reduce": 5.8999998145736754e-05,
|
| 229 |
+
"tuple-simplifier": 0.00014600000577047467,
|
| 230 |
+
"unpack-nested-aws-ntwsr": 0.0002479999966453761,
|
| 231 |
+
"unroll-while-loop": 7.999999979801942e-06,
|
| 232 |
+
"zero_sized_hlo_elimination": 0.0007040000054985285
|
| 233 |
+
},
|
| 234 |
+
"hilo": {
|
| 235 |
+
"ConstantSize": 926335.0,
|
| 236 |
+
"HloInputCount": 371.0,
|
| 237 |
+
"HloMacCount": 26463305728.0,
|
| 238 |
+
"HloOutputCount": 57.0,
|
| 239 |
+
"IfmapSize": 3910916096.0,
|
| 240 |
+
"OfmapSize": 1879048192.0,
|
| 241 |
+
"OutputsReadFromCount": 0.0,
|
| 242 |
+
"PassthroughTensorsCount": 0.0,
|
| 243 |
+
"RedundantOutputCount": 0.0,
|
| 244 |
+
"Traffic": 886427776.0
|
| 245 |
+
},
|
| 246 |
+
"tensorizer": {
|
| 247 |
+
"DMATilingProfiler::TotalInstructionsAfterTiling": 22051.0,
|
| 248 |
+
"StaticProfiler::AifUb": 173.52798461914063,
|
| 249 |
+
"StaticProfiler::ArithmeticIntensityTensorizer": 150.2424774169922,
|
| 250 |
+
"StaticProfiler::AverageDmaLength": 2589.193359375,
|
| 251 |
+
"StaticProfiler::DDRTransferBytes": 407886880.0,
|
| 252 |
+
"StaticProfiler::InternalTransferBytes": 327079712.0,
|
| 253 |
+
"StaticProfiler::LoadExpanded": 89436.0,
|
| 254 |
+
"StaticProfiler::StoreExpanded": 2154.0,
|
| 255 |
+
"StaticProfiler::TotalDMAExpanded": 91590.0,
|
| 256 |
+
"StaticProfiler::TotalDynamicInstancesCount": 26447.0,
|
| 257 |
+
"StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 25996.0,
|
| 258 |
+
"StaticProfiler::TotalLNCComm": 0.0,
|
| 259 |
+
"StaticProfiler::TotalLNCCommTransfer": 0.0,
|
| 260 |
+
"TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
|
| 261 |
+
"TilingProfiler::DmaInstructionsAfterTiling": 0.0,
|
| 262 |
+
"TilingProfiler::GenericInstructionsAfterTiling": 4.0,
|
| 263 |
+
"TilingProfiler::MatMultInstructionsAfterTiling": 11424.0,
|
| 264 |
+
"TilingProfiler::NumPfTransposes": 6.0,
|
| 265 |
+
"TilingProfiler::NumPfTransposesForIo": 1.0,
|
| 266 |
+
"TilingProfiler::NumPfTransposesForLocal": 1.0,
|
| 267 |
+
"TilingProfiler::NumPfTransposesForNonlocal": 4.0,
|
| 268 |
+
"TilingProfiler::PfTransposeInstructions": 10291.0,
|
| 269 |
+
"TilingProfiler::PfTransposeInstructionsForIo": 9504.0,
|
| 270 |
+
"TilingProfiler::PfTransposeInstructionsForLocal": 1.0,
|
| 271 |
+
"TilingProfiler::PfTransposeInstructionsForNonlocal": 786.0,
|
| 272 |
+
"TilingProfiler::ReduceInstructionsAfterTiling": 4.0,
|
| 273 |
+
"TilingProfiler::SimdInstructionsAfterTiling": 164.0,
|
| 274 |
+
"TilingProfiler::TotalInstructionsAfterTiling": 0.0,
|
| 275 |
+
"TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
|
| 276 |
+
"TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
|
| 277 |
+
"TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
|
| 278 |
+
"TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
|
| 279 |
+
"TransformConvOp::conv2d_column_packing": 0.0,
|
| 280 |
+
"TransformConvOp::conv2d_column_packing_1": 0.0,
|
| 281 |
+
"TransformConvOp::conv2d_column_packing_io10": 0.0,
|
| 282 |
+
"TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
|
| 283 |
+
}
|
| 284 |
+
},
|
| 285 |
+
"all": {
|
| 286 |
+
"compiletime": {
|
| 287 |
+
"algsimp": 0.0016029999824240804,
|
| 288 |
+
"call-inliner": 0.00019999999494757503,
|
| 289 |
+
"collective-stream-id-checker": 5.2999999752501026e-05,
|
| 290 |
+
"comparison-expander": 0.00042699999175965786,
|
| 291 |
+
"constant-statistics": 0.0003980000037699938,
|
| 292 |
+
"constant_folding": 0.0001340000017080456,
|
| 293 |
+
"dce": 3.7999998312443495e-05,
|
| 294 |
+
"dot_decomposer": 0.0008870000019669533,
|
| 295 |
+
"eliminate-redundant-compare": 0.0001289999927394092,
|
| 296 |
+
"flatten-call-graph": 0.0006440000142902136,
|
| 297 |
+
"hlo-mac-count": 0.007197000086307526,
|
| 298 |
+
"instruction-histogram": 0.0007319999858736992,
|
| 299 |
+
"io-con-pipe-begin": 4.999999873689376e-06,
|
| 300 |
+
"io-con-pipe-end": 9.999999974752427e-07,
|
| 301 |
+
"io-layout-normalization": 0.0008159999852068722,
|
| 302 |
+
"io-statistics": 3.899999865097925e-05,
|
| 303 |
+
"map-inline": 0.0006960000027902424,
|
| 304 |
+
"native-to-custom-softmax": 0.00030499999411404133,
|
| 305 |
+
"native-to-custom-softmax-dx": 0.00039000000106170774,
|
| 306 |
+
"neuron-hlo-verifier": 0.009362000040709972,
|
| 307 |
+
"opt-barrier-removal": 0.0002589999930933118,
|
| 308 |
+
"pre-par-pipe-begin": 9.999999974752427e-07,
|
| 309 |
+
"pre-par-pipe-end": 0.0,
|
| 310 |
+
"pre-partition-simplification": 0.06850799918174744,
|
| 311 |
+
"replace-minimum-constant": 0.00034500000765547156,
|
| 312 |
+
"reshape-mover": 4.8999998398358e-05,
|
| 313 |
+
"simplify-while-loops": 4.400000034365803e-05,
|
| 314 |
+
"tuple-simplifier": 0.0001340000017080456,
|
| 315 |
+
"unpack-nested-aws-ntwsr": 0.00023799999326001853,
|
| 316 |
+
"unroll-while-loop": 7.999999979801942e-06,
|
| 317 |
+
"zero_sized_hlo_elimination": 0.0007040000054985285
|
| 318 |
+
}
|
| 319 |
+
},
|
| 320 |
+
"attention_isa_kernel": {
|
| 321 |
+
"compiletime": {
|
| 322 |
+
"CoalesceCCOp": 0.00021982192993164063,
|
| 323 |
+
"DMALocalityOpt": 0.00021767616271972656,
|
| 324 |
+
"DMAProfiler": 0.0002532005310058594,
|
| 325 |
+
"DataStreaming": 0.00019359588623046875,
|
| 326 |
+
"DoNothing": 0.00017213821411132813,
|
| 327 |
+
"ExpandISAMacro": 0.00021219253540039063,
|
| 328 |
+
"FactorizeBlkDims": 0.0016205310821533203,
|
| 329 |
+
"InferPSumTensor": 0.00067901611328125,
|
| 330 |
+
"InferSharedMemLoc": 0.0005524158477783203,
|
| 331 |
+
"InsertCoreBarrier": 0.00033855438232421875,
|
| 332 |
+
"LateLegalizeInst": 0.00021457672119140625,
|
| 333 |
+
"LateNeuronInstComb": 0.00042700767517089844,
|
| 334 |
+
"LegalizeSundaAccess": 0.00022602081298828125,
|
| 335 |
+
"LegalizeType": 0.00026869773864746094,
|
| 336 |
+
"LowerBroadcast": 0.0002257823944091797,
|
| 337 |
+
"LowerIntrinsics": 0.0002770423889160156,
|
| 338 |
+
"LowerTranspose": 0.0002372264862060547,
|
| 339 |
+
"NeuronInstComb": 0.0004298686981201172,
|
| 340 |
+
"NeuronLICM": 0.00019097328186035156,
|
| 341 |
+
"NeuronSimplifyPredicates": 0.00029349327087402344,
|
| 342 |
+
"NeuronValueNumbering": 0.00023818016052246094,
|
| 343 |
+
"SFKVectorizer": 0.0022597312927246094,
|
| 344 |
+
"SimpleAllReduceTiling": 0.00019431114196777344,
|
| 345 |
+
"SimplifyNeuronTensor": 0.0004868507385253906,
|
| 346 |
+
"SpillPSum": 0.0006351470947265625,
|
| 347 |
+
"WeightCoalescing": 0.00022172927856445313
|
| 348 |
+
}
|
| 349 |
+
},
|
| 350 |
+
"cumsum": {
|
| 351 |
+
"compiletime": {
|
| 352 |
+
"CoalesceCCOp": 0.0003490447998046875,
|
| 353 |
+
"DMALocalityOpt": 0.00027871131896972656,
|
| 354 |
+
"DMAProfiler": 0.0013451576232910156,
|
| 355 |
+
"DataStreaming": 0.00047016143798828125,
|
| 356 |
+
"DoNothing": 0.0002353191375732422,
|
| 357 |
+
"ExpandISAMacro": 0.0008096694946289063,
|
| 358 |
+
"FactorizeBlkDims": 0.0007121562957763672,
|
| 359 |
+
"InferPSumTensor": 0.0026960372924804688,
|
| 360 |
+
"InferSharedMemLoc": 0.0007166862487792969,
|
| 361 |
+
"InsertCoreBarrier": 0.0004069805145263672,
|
| 362 |
+
"LateLegalizeInst": 0.0005886554718017578,
|
| 363 |
+
"LateNeuronInstComb": 0.002978801727294922,
|
| 364 |
+
"LegalizeSundaAccess": 0.003289461135864258,
|
| 365 |
+
"LegalizeType": 0.00041961669921875,
|
| 366 |
+
"LowerBroadcast": 0.0004119873046875,
|
| 367 |
+
"LowerIntrinsics": 0.0003657341003417969,
|
| 368 |
+
"LowerTranspose": 0.0004086494445800781,
|
| 369 |
+
"NeuronInstComb": 0.0012252330780029297,
|
| 370 |
+
"NeuronLICM": 0.0016541481018066406,
|
| 371 |
+
"NeuronSimplifyPredicates": 0.003880739212036133,
|
| 372 |
+
"NeuronValueNumbering": 0.0015976428985595703,
|
| 373 |
+
"SFKVectorizer": 0.005974292755126953,
|
| 374 |
+
"SimpleAllReduceTiling": 0.0007178783416748047,
|
| 375 |
+
"SimplifyNeuronTensor": 0.001119852066040039,
|
| 376 |
+
"SpillPSum": 0.003050565719604492,
|
| 377 |
+
"WeightCoalescing": 0.004181385040283203
|
| 378 |
+
}
|
| 379 |
+
},
|
| 380 |
+
"sg00": {
|
| 381 |
+
"compiletime": {
|
| 382 |
+
"CanonicalizeConv": 1.1000000085914508e-05,
|
| 383 |
+
"CanonicalizeForTensorizer": 1.4000000192027073e-05,
|
| 384 |
+
"Canonicalizer": 0.00028899998869746923,
|
| 385 |
+
"HoistCompute": 1.9999999949504854e-06,
|
| 386 |
+
"IdentifyCrossPassTensors": 1.5999999959603883e-05,
|
| 387 |
+
"MemcastMotion": 9.999999747378752e-06,
|
| 388 |
+
"PenguinizeFunctions": 1.4000000192027073e-05,
|
| 389 |
+
"PruneFunctions": 1.4000000192027073e-05,
|
| 390 |
+
"RemoveOptimizationBarriers": 2.099999983329326e-05,
|
| 391 |
+
"ScatterMotion": 9.000000318337698e-06,
|
| 392 |
+
"TensorizerLegalizationPass": 2.2000000171829015e-05,
|
| 393 |
+
"VerifySupportedOps": 9.999999747378752e-06,
|
| 394 |
+
"algsimp": 4.8000001697801054e-05,
|
| 395 |
+
"batchnorm_expander": 1.2000000424450263e-05,
|
| 396 |
+
"boundary-marker-removal": 3.999999989900971e-06,
|
| 397 |
+
"call-inliner": 7.000000096013537e-06,
|
| 398 |
+
"canonicalize-boundary-marker": 3.999999989900971e-06,
|
| 399 |
+
"collective-stream-id-checker": 1.9999999949504854e-06,
|
| 400 |
+
"comparison-expander": 3.999999989900971e-06,
|
| 401 |
+
"computation-deduplicator": 1.4999999621068127e-05,
|
| 402 |
+
"config-lowering": 3.400000059627928e-05,
|
| 403 |
+
"constant_folding": 7.000000096013537e-06,
|
| 404 |
+
"cse": 9.999999747378752e-06,
|
| 405 |
+
"dce": 9.999999974752427e-07,
|
| 406 |
+
"dynamic-slice-transpose": 3.999999989900971e-06,
|
| 407 |
+
"eliminate-redundant-compare": 3.000000106112566e-06,
|
| 408 |
+
"emit-offloaded-dropout": 1.2000000424450263e-05,
|
| 409 |
+
"flatten-call-graph": 7.000000096013537e-06,
|
| 410 |
+
"fuse-send-recv": 1.700000029813964e-05,
|
| 411 |
+
"hilo-conditional-to-select": 3.000000106112566e-06,
|
| 412 |
+
"hilo::LegalizeAlias": 4.999999873689376e-06,
|
| 413 |
+
"hilo::NeuronInstCombine": 5.700000110664405e-05,
|
| 414 |
+
"hilo::NeuronOpFusion": 1.4000000192027073e-05,
|
| 415 |
+
"hilo::ReplaceTokenTypeWithU8Pass": 1.2000000424450263e-05,
|
| 416 |
+
"hilo::ScheduleFusion": 9.999999974752427e-07,
|
| 417 |
+
"hilo::SixtyFourHack": 1.2000000424450263e-05,
|
| 418 |
+
"hilo::VerifyAliasing": 1.9999999949504854e-06,
|
| 419 |
+
"hlo-mac-count": 8.499999967170879e-05,
|
| 420 |
+
"legalize-ccops-for-tensorizer": 9.999999974752427e-07,
|
| 421 |
+
"legalize-compare": 3.000000106112566e-06,
|
| 422 |
+
"lower-argminmax-custom-call": 3.000000106112566e-06,
|
| 423 |
+
"map-inline": 9.000000318337698e-06,
|
| 424 |
+
"metadata-naming": 1.2000000424450263e-05,
|
| 425 |
+
"mlir::detail::OpToOpPassAdaptor": 2.300000051036477e-05,
|
| 426 |
+
"mlir::hlo::MhloToPyPenguin": 0.0016840000171214342,
|
| 427 |
+
"mlir::mhlo::LowerComplexExtraPass": 7.699999696342275e-05,
|
| 428 |
+
"mlir::mhlo::LowerComplexPass": 0.0001720000000204891,
|
| 429 |
+
"native-to-custom-softmax": 4.999999873689376e-06,
|
| 430 |
+
"native-to-custom-softmax-dx": 7.200000254670158e-05,
|
| 431 |
+
"neuron-hlo-verifier": 0.000371000001905486,
|
| 432 |
+
"operand_upcaster": 1.4000000192027073e-05,
|
| 433 |
+
"post-par-pipe-begin": 9.999999974752427e-07,
|
| 434 |
+
"post-par-pipe-end": 0.0,
|
| 435 |
+
"post-partition-simplification": 0.00043399998685345054,
|
| 436 |
+
"replace-minimum-constant": 6.000000212225132e-06,
|
| 437 |
+
"reshape-mover": 1.9999999949504854e-06,
|
| 438 |
+
"simplify-concat": 3.300000025774352e-05,
|
| 439 |
+
"simplify-while-loops": 1.9999999949504854e-06,
|
| 440 |
+
"transform-variadic-reduce": 7.000000096013537e-06,
|
| 441 |
+
"tuple-simplifier": 3.999999989900971e-06,
|
| 442 |
+
"unpack-nested-aws-ntwsr": 3.000000106112566e-06,
|
| 443 |
+
"unroll-while-loop": 0.0
|
| 444 |
+
},
|
| 445 |
+
"hilo": {
|
| 446 |
+
"ArithmeticIntensity": 17.4229793548584,
|
| 447 |
+
"ConstantSize": 926335.0,
|
| 448 |
+
"HloInputCount": 371.0,
|
| 449 |
+
"HloMacCount": 3489660928.0,
|
| 450 |
+
"HloOutputCount": 57.0,
|
| 451 |
+
"IfmapSize": 3910916096.0,
|
| 452 |
+
"OfmapSize": 1879048192.0,
|
| 453 |
+
"OutputsReadFromCount": 0.0,
|
| 454 |
+
"PassthroughTensorsCount": 0.0,
|
| 455 |
+
"RedundantOutputCount": 0.0,
|
| 456 |
+
"Traffic": 400581408.0
|
| 457 |
+
}
|
| 458 |
+
},
|
| 459 |
+
"sg0000": {
|
| 460 |
+
"compiletime": {
|
| 461 |
+
"AGOrderingAnalysisPass": 0.07508444786071777,
|
| 462 |
+
"AffinePredicateResolution": 0.0010340213775634766,
|
| 463 |
+
"AliasDependencyElimination": 0.0002384185791015625,
|
| 464 |
+
"AliasDependencyInduction": 0.007371425628662109,
|
| 465 |
+
"AliasDependencyReset": 0.0582888126373291,
|
| 466 |
+
"BFComputeCutting": 0.013819217681884766,
|
| 467 |
+
"BirCodeGenLoop": 0.06449317932128906,
|
| 468 |
+
"CCOpFusion": 0.04928326606750488,
|
| 469 |
+
"CanonicalizeDAGForPGTiling": 0.0076160430908203125,
|
| 470 |
+
"CanonicalizeIR": 0.0027213096618652344,
|
| 471 |
+
"CoalesceCCOp": 0.007978439331054688,
|
| 472 |
+
"CommuteConcat": 0.002101421356201172,
|
| 473 |
+
"DMALocalityOpt": 0.005911350250244141,
|
| 474 |
+
"DMAProfiler": 0.011723995208740234,
|
| 475 |
+
"DMATilingProfiler": 0.0077321529388427734,
|
| 476 |
+
"DataLocalityOpt": 0.20074963569641113,
|
| 477 |
+
"DataStreaming": 0.012155294418334961,
|
| 478 |
+
"DeConcat": 0.00474858283996582,
|
| 479 |
+
"DeadCodeElimination": 0.002126932144165039,
|
| 480 |
+
"DeadStoreElimination": 0.044701576232910156,
|
| 481 |
+
"DelinearIndices": 0.019860267639160156,
|
| 482 |
+
"Delinearization": 0.006117343902587891,
|
| 483 |
+
"DelinearizeSPMD": 0.04185628890991211,
|
| 484 |
+
"DoNothing": 9.918212890625e-05,
|
| 485 |
+
"DramToDramTranspose": 0.017105817794799805,
|
| 486 |
+
"DumpGraphAndMetadata": 0.0168914794921875,
|
| 487 |
+
"EliminateDivs": 0.0026845932006835938,
|
| 488 |
+
"ExpandBatchNorm": 0.0020225048065185547,
|
| 489 |
+
"ExpandISAMacro": 0.007347822189331055,
|
| 490 |
+
"FactorizeBlkDims": 0.05445575714111328,
|
| 491 |
+
"FactorizeThreadAxesInFreeDims": 0.004782199859619141,
|
| 492 |
+
"FlattenMacroLoop": 0.012040138244628906,
|
| 493 |
+
"GenericAccessSimplifier": 0.001428365707397461,
|
| 494 |
+
"InferInitValue": 0.08275437355041504,
|
| 495 |
+
"InferIntrinsicOnCC": 0.016964197158813477,
|
| 496 |
+
"InferNeuronTensor": 0.0713052749633789,
|
| 497 |
+
"InferNonlocalTensors": 0.17369747161865234,
|
| 498 |
+
"InferPSumTensor": 0.07679295539855957,
|
| 499 |
+
"InferShardAxis": 0.5430936813354492,
|
| 500 |
+
"InferSharedMemLoc": 0.0051038265228271484,
|
| 501 |
+
"InlineNativeKernels": 0.005239963531494141,
|
| 502 |
+
"InsertCoreBarrier": 0.008324384689331055,
|
| 503 |
+
"InsertIOTransposes": 0.038658857345581055,
|
| 504 |
+
"InsertImplicitShardAxisBeforeISel": 0.009135007858276367,
|
| 505 |
+
"InsertLocalTransposes": 0.029627084732055664,
|
| 506 |
+
"InsertOffloadedTransposes": 0.019885540008544922,
|
| 507 |
+
"LICM": 0.0056383609771728516,
|
| 508 |
+
"LateLegalizeInst": 0.011803150177001953,
|
| 509 |
+
"LateLegalizePostSplit": 0.005868196487426758,
|
| 510 |
+
"LateLowerReshapeOp": 0.007382631301879883,
|
| 511 |
+
"LateLowerTensorOp": 0.004155397415161133,
|
| 512 |
+
"LateNeuronInstComb": 0.0334017276763916,
|
| 513 |
+
"LayoutPreprocessing": 0.25243687629699707,
|
| 514 |
+
"LayoutPreprocessingAndAnalysis": 0.30139756202697754,
|
| 515 |
+
"LayoutRequirementAnalysis": 0.014056921005249023,
|
| 516 |
+
"LegalizeCCOpLayout": 0.0020928382873535156,
|
| 517 |
+
"LegalizeOpLevelAlias": 0.0016238689422607422,
|
| 518 |
+
"LegalizePartitionReduce": 0.0030252933502197266,
|
| 519 |
+
"LegalizeSundaAccess": 0.05711483955383301,
|
| 520 |
+
"LegalizeSundaMacro": 0.023845911026000977,
|
| 521 |
+
"LegalizeType": 0.00843501091003418,
|
| 522 |
+
"LocalLayoutOpt": 0.11445784568786621,
|
| 523 |
+
"LoopFusion": 0.01024007797241211,
|
| 524 |
+
"LoopSplitting": 0.0017781257629394531,
|
| 525 |
+
"LowerBroadcast": 0.0037119388580322266,
|
| 526 |
+
"LowerCCOpBlockAxis": 0.014172077178955078,
|
| 527 |
+
"LowerComplexBroadcast": 0.004027366638183594,
|
| 528 |
+
"LowerIntrinsics": 0.03793048858642578,
|
| 529 |
+
"LowerShardAxis": 0.012651443481445313,
|
| 530 |
+
"LowerTensorOp": 0.01001119613647461,
|
| 531 |
+
"LowerToSendRecv": 0.005930900573730469,
|
| 532 |
+
"LowerTranspose": 0.018492937088012695,
|
| 533 |
+
"MacroGeneration": 0.11934685707092285,
|
| 534 |
+
"MaskPropagation": 0.005895137786865234,
|
| 535 |
+
"MemcpyElimination": 0.09257030487060547,
|
| 536 |
+
"MutateDataType": 0.0017631053924560547,
|
| 537 |
+
"NeuronAliasDependencyInduction": 0.0007777214050292969,
|
| 538 |
+
"NeuronAliasDependencyReset": 0.03222823143005371,
|
| 539 |
+
"NeuronInstComb": 0.02764892578125,
|
| 540 |
+
"NeuronLICM": 0.015506982803344727,
|
| 541 |
+
"NeuronLoopFusion": 0.0383763313293457,
|
| 542 |
+
"NeuronLoopInterchange": 0.010429620742797852,
|
| 543 |
+
"NeuronSimplifier": 0.033356666564941406,
|
| 544 |
+
"NeuronSimplifyPredicates": 0.006680965423583984,
|
| 545 |
+
"NeuronValueNumbering": 0.019241809844970703,
|
| 546 |
+
"OptimizeAliasedCopyChain": 0.0010235309600830078,
|
| 547 |
+
"OptimizeNKIKernels": 0.45916128158569336,
|
| 548 |
+
"PAGLayoutOpt": 0.7117609977722168,
|
| 549 |
+
"PComputeCutting": 0.020105838775634766,
|
| 550 |
+
"PGLayoutTilingPipeline": 2.928948163986206,
|
| 551 |
+
"PGTiling": 0.39027953147888184,
|
| 552 |
+
"PadElimination": 0.0007317066192626953,
|
| 553 |
+
"ParAxesAnnotation": 0.6492185592651367,
|
| 554 |
+
"PartialLoopFusion": 0.0445561408996582,
|
| 555 |
+
"PartialSimdFusion": 0.039563655853271484,
|
| 556 |
+
"PerfectLoopNest": 0.0034646987915039063,
|
| 557 |
+
"RecognizeOpIdiom": 0.016507387161254883,
|
| 558 |
+
"Recompute": 0.0003933906555175781,
|
| 559 |
+
"RelaxPredicates": 0.005345582962036133,
|
| 560 |
+
"Rematerialization": 0.005880117416381836,
|
| 561 |
+
"RemoveShardedPartitionAxes": 0.03753328323364258,
|
| 562 |
+
"ReshapeWeights": 0.002991914749145508,
|
| 563 |
+
"ResolveAccessConflict": 0.0245821475982666,
|
| 564 |
+
"ResolveComplicatePredicates": 0.0018818378448486328,
|
| 565 |
+
"RewriteReplicationMatmul": 0.0024051666259765625,
|
| 566 |
+
"RewriteWeights": 0.006072998046875,
|
| 567 |
+
"SFKVectorizer": 0.49936652183532715,
|
| 568 |
+
"ShardingPropagationAnalysis": 0.03256559371948242,
|
| 569 |
+
"SimpleAllReduceTiling": 0.0036296844482421875,
|
| 570 |
+
"Simplifier": 0.007125377655029297,
|
| 571 |
+
"SimplifyMacroPredicates": 0.02839207649230957,
|
| 572 |
+
"SimplifyNeuronTensor": 0.021625995635986328,
|
| 573 |
+
"SimplifySlice": 0.0024862289428710938,
|
| 574 |
+
"SimplifyTensor": 0.033231496810913086,
|
| 575 |
+
"SpillPSum": 0.034162282943725586,
|
| 576 |
+
"SplitAPUnionSets": 0.042994022369384766,
|
| 577 |
+
"SplitAccGrp": 0.00764918327331543,
|
| 578 |
+
"StaticProfiler": 0.008186817169189453,
|
| 579 |
+
"StaticTransposeLocalTensor": 0.007767438888549805,
|
| 580 |
+
"SundaISel": 0.05960273742675781,
|
| 581 |
+
"TCTransform": 0.00103759765625,
|
| 582 |
+
"TensorInitialization": 0.007684469223022461,
|
| 583 |
+
"TensorOpSimplifier": 0.006952047348022461,
|
| 584 |
+
"TensorOpTransform": 0.030390501022338867,
|
| 585 |
+
"TileCCOps": 0.006802797317504883,
|
| 586 |
+
"TilingProfiler": 0.040956735610961914,
|
| 587 |
+
"TransformConvOp": 0.0029840469360351563,
|
| 588 |
+
"TritiumFusion": 0.03676962852478027,
|
| 589 |
+
"ValueNumbering": 0.0034532546997070313,
|
| 590 |
+
"VectorizeDMA": 0.005709171295166016,
|
| 591 |
+
"VectorizeMatMult": 0.030527591705322266,
|
| 592 |
+
"WeightCoalescing": 0.0040700435638427734,
|
| 593 |
+
"ZeroSizeTensorElimination": 0.0002455711364746094
|
| 594 |
+
},
|
| 595 |
+
"tensorizer": {
|
| 596 |
+
"DMATilingProfiler::TotalInstructionsAfterTiling": 1174.0,
|
| 597 |
+
"StaticProfiler::AifUb": 16.874553680419922,
|
| 598 |
+
"StaticProfiler::ArithmeticIntensityTensorizer": 204.6156768798828,
|
| 599 |
+
"StaticProfiler::AverageDmaLength": 1413.5869140625,
|
| 600 |
+
"StaticProfiler::AverageFractalPeUtilization": 99.77033233642578,
|
| 601 |
+
"StaticProfiler::AveragePartitionUtilization": 99.01372528076172,
|
| 602 |
+
"StaticProfiler::AveragePeUtilization": 99.29181671142578,
|
| 603 |
+
"StaticProfiler::DDRTransferBytes": 38148616.0,
|
| 604 |
+
"StaticProfiler::InternalTransferBytes": 22941696.0,
|
| 605 |
+
"StaticProfiler::LoadExpanded": 12553.0,
|
| 606 |
+
"StaticProfiler::LocalizationEfficiency": 1212.5694580078125,
|
| 607 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1809.3712158203125,
|
| 608 |
+
"StaticProfiler::StoreExpanded": 8193.0,
|
| 609 |
+
"StaticProfiler::TotalDMAExpanded": 20746.0,
|
| 610 |
+
"StaticProfiler::TotalDynamicInstancesCount": 1510.0,
|
| 611 |
+
"StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 1506.0,
|
| 612 |
+
"StaticProfiler::TotalLNCComm": 0.0,
|
| 613 |
+
"StaticProfiler::TotalLNCCommTransfer": 0.0,
|
| 614 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
|
| 615 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
|
| 616 |
+
"TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
|
| 617 |
+
"TilingProfiler::DmaInstructionsAfterTiling": 0.0,
|
| 618 |
+
"TilingProfiler::GenericInstructionsAfterTiling": 40.0,
|
| 619 |
+
"TilingProfiler::MatMultInstructionsAfterTiling": 644.0,
|
| 620 |
+
"TilingProfiler::NumPfTransposes": 7.0,
|
| 621 |
+
"TilingProfiler::NumPfTransposesForIo": 2.0,
|
| 622 |
+
"TilingProfiler::NumPfTransposesForLocal": 2.0,
|
| 623 |
+
"TilingProfiler::NumPfTransposesForNonlocal": 3.0,
|
| 624 |
+
"TilingProfiler::PfTransposeInstructions": 209.0,
|
| 625 |
+
"TilingProfiler::PfTransposeInstructionsForIo": 65.0,
|
| 626 |
+
"TilingProfiler::PfTransposeInstructionsForLocal": 48.0,
|
| 627 |
+
"TilingProfiler::PfTransposeInstructionsForNonlocal": 96.0,
|
| 628 |
+
"TilingProfiler::ReduceInstructionsAfterTiling": 0.0,
|
| 629 |
+
"TilingProfiler::SimdInstructionsAfterTiling": 136.0,
|
| 630 |
+
"TilingProfiler::TotalInstructionsAfterTiling": 0.0,
|
| 631 |
+
"TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
|
| 632 |
+
"TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
|
| 633 |
+
"TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
|
| 634 |
+
"TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
|
| 635 |
+
"TransformConvOp::conv2d_column_packing": 0.0,
|
| 636 |
+
"TransformConvOp::conv2d_column_packing_1": 0.0,
|
| 637 |
+
"TransformConvOp::conv2d_column_packing_io10": 0.0,
|
| 638 |
+
"TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
|
| 639 |
+
}
|
| 640 |
+
},
|
| 641 |
+
"sg0001": {
|
| 642 |
+
"compiletime": {
|
| 643 |
+
"AGOrderingAnalysisPass": 0.08034706115722656,
|
| 644 |
+
"AffinePredicateResolution": 0.0021657943725585938,
|
| 645 |
+
"AliasDependencyElimination": 0.0002224445343017578,
|
| 646 |
+
"AliasDependencyInduction": 0.006604909896850586,
|
| 647 |
+
"AliasDependencyReset": 0.028621673583984375,
|
| 648 |
+
"BFComputeCutting": 0.006361484527587891,
|
| 649 |
+
"BirCodeGenLoop": 0.043970584869384766,
|
| 650 |
+
"CCOpFusion": 0.03917193412780762,
|
| 651 |
+
"CanonicalizeDAGForPGTiling": 0.015412569046020508,
|
| 652 |
+
"CanonicalizeIR": 0.0026285648345947266,
|
| 653 |
+
"CoalesceCCOp": 0.019171714782714844,
|
| 654 |
+
"CommuteConcat": 0.0022630691528320313,
|
| 655 |
+
"DMALocalityOpt": 0.0018835067749023438,
|
| 656 |
+
"DMAProfiler": 0.015621662139892578,
|
| 657 |
+
"DMATilingProfiler": 0.007387399673461914,
|
| 658 |
+
"DataLocalityOpt": 0.3166489601135254,
|
| 659 |
+
"DataStreaming": 0.008202552795410156,
|
| 660 |
+
"DeConcat": 0.0027625560760498047,
|
| 661 |
+
"DeadCodeElimination": 0.008514642715454102,
|
| 662 |
+
"DeadStoreElimination": 0.02995467185974121,
|
| 663 |
+
"DelinearIndices": 0.020328283309936523,
|
| 664 |
+
"Delinearization": 0.008889198303222656,
|
| 665 |
+
"DelinearizeSPMD": 0.025659799575805664,
|
| 666 |
+
"DoNothing": 9.298324584960938e-05,
|
| 667 |
+
"DramToDramTranspose": 0.013378381729125977,
|
| 668 |
+
"DumpGraphAndMetadata": 0.011143684387207031,
|
| 669 |
+
"EliminateDivs": 0.006491422653198242,
|
| 670 |
+
"ExpandBatchNorm": 0.0015842914581298828,
|
| 671 |
+
"ExpandISAMacro": 0.014866113662719727,
|
| 672 |
+
"FactorizeBlkDims": 0.02399158477783203,
|
| 673 |
+
"FactorizeThreadAxesInFreeDims": 0.008170843124389648,
|
| 674 |
+
"FlattenMacroLoop": 0.013584375381469727,
|
| 675 |
+
"GenericAccessSimplifier": 0.0016484260559082031,
|
| 676 |
+
"InferInitValue": 0.09902763366699219,
|
| 677 |
+
"InferIntrinsicOnCC": 0.05336475372314453,
|
| 678 |
+
"InferNeuronTensor": 0.0689244270324707,
|
| 679 |
+
"InferNonlocalTensors": 0.0623164176940918,
|
| 680 |
+
"InferPSumTensor": 0.06397223472595215,
|
| 681 |
+
"InferShardAxis": 0.7081491947174072,
|
| 682 |
+
"InferSharedMemLoc": 0.008078813552856445,
|
| 683 |
+
"InlineNativeKernels": 0.002736806869506836,
|
| 684 |
+
"InsertCoreBarrier": 0.008532524108886719,
|
| 685 |
+
"InsertIOTransposes": 0.04539895057678223,
|
| 686 |
+
"InsertImplicitShardAxisBeforeISel": 0.011088132858276367,
|
| 687 |
+
"InsertLocalTransposes": 0.008382081985473633,
|
| 688 |
+
"InsertOffloadedTransposes": 0.009244203567504883,
|
| 689 |
+
"LICM": 0.0059854984283447266,
|
| 690 |
+
"LateLegalizeInst": 0.012192487716674805,
|
| 691 |
+
"LateLegalizePostSplit": 0.004922151565551758,
|
| 692 |
+
"LateLowerReshapeOp": 0.0050048828125,
|
| 693 |
+
"LateLowerTensorOp": 0.00384521484375,
|
| 694 |
+
"LateNeuronInstComb": 0.01603221893310547,
|
| 695 |
+
"LayoutPreprocessing": 0.083892822265625,
|
| 696 |
+
"LayoutPreprocessingAndAnalysis": 0.14038705825805664,
|
| 697 |
+
"LayoutRequirementAnalysis": 0.026170969009399414,
|
| 698 |
+
"LegalizeCCOpLayout": 0.0018677711486816406,
|
| 699 |
+
"LegalizeOpLevelAlias": 0.0019845962524414063,
|
| 700 |
+
"LegalizePartitionReduce": 0.002770662307739258,
|
| 701 |
+
"LegalizeSundaAccess": 0.02824854850769043,
|
| 702 |
+
"LegalizeSundaMacro": 0.025277376174926758,
|
| 703 |
+
"LegalizeType": 0.005255222320556641,
|
| 704 |
+
"LocalLayoutOpt": 0.1487877368927002,
|
| 705 |
+
"LoopFusion": 0.009909629821777344,
|
| 706 |
+
"LoopSplitting": 0.004529237747192383,
|
| 707 |
+
"LowerBroadcast": 0.0027620792388916016,
|
| 708 |
+
"LowerCCOpBlockAxis": 0.012650728225708008,
|
| 709 |
+
"LowerComplexBroadcast": 0.015005111694335938,
|
| 710 |
+
"LowerIntrinsics": 0.03992509841918945,
|
| 711 |
+
"LowerShardAxis": 0.01078486442565918,
|
| 712 |
+
"LowerTensorOp": 0.010359048843383789,
|
| 713 |
+
"LowerToSendRecv": 0.010585546493530273,
|
| 714 |
+
"LowerTranspose": 0.024251461029052734,
|
| 715 |
+
"MacroGeneration": 0.17415404319763184,
|
| 716 |
+
"MaskPropagation": 0.009861946105957031,
|
| 717 |
+
"MemcpyElimination": 0.08973836898803711,
|
| 718 |
+
"MutateDataType": 0.0023250579833984375,
|
| 719 |
+
"NeuronAliasDependencyInduction": 0.0036211013793945313,
|
| 720 |
+
"NeuronAliasDependencyReset": 0.03322243690490723,
|
| 721 |
+
"NeuronInstComb": 0.027010679244995117,
|
| 722 |
+
"NeuronLICM": 0.014135122299194336,
|
| 723 |
+
"NeuronLoopFusion": 0.0790092945098877,
|
| 724 |
+
"NeuronLoopInterchange": 0.006104946136474609,
|
| 725 |
+
"NeuronSimplifier": 0.02999567985534668,
|
| 726 |
+
"NeuronSimplifyPredicates": 0.0038328170776367188,
|
| 727 |
+
"NeuronValueNumbering": 0.016868114471435547,
|
| 728 |
+
"OptimizeAliasedCopyChain": 0.0012192726135253906,
|
| 729 |
+
"OptimizeNKIKernels": 0.4351818561553955,
|
| 730 |
+
"PAGLayoutOpt": 0.3483104705810547,
|
| 731 |
+
"PComputeCutting": 0.02324676513671875,
|
| 732 |
+
"PGLayoutTilingPipeline": 2.0860910415649414,
|
| 733 |
+
"PGTiling": 0.4031491279602051,
|
| 734 |
+
"PadElimination": 0.000728607177734375,
|
| 735 |
+
"ParAxesAnnotation": 0.30509090423583984,
|
| 736 |
+
"PartialLoopFusion": 0.06583142280578613,
|
| 737 |
+
"PartialSimdFusion": 0.1207880973815918,
|
| 738 |
+
"PerfectLoopNest": 0.010277032852172852,
|
| 739 |
+
"RecognizeOpIdiom": 0.004372358322143555,
|
| 740 |
+
"Recompute": 0.00031304359436035156,
|
| 741 |
+
"RelaxPredicates": 0.005488395690917969,
|
| 742 |
+
"Rematerialization": 0.0020155906677246094,
|
| 743 |
+
"RemoveShardedPartitionAxes": 0.026065587997436523,
|
| 744 |
+
"ReshapeWeights": 0.0033690929412841797,
|
| 745 |
+
"ResolveAccessConflict": 0.011795282363891602,
|
| 746 |
+
"ResolveComplicatePredicates": 0.005822658538818359,
|
| 747 |
+
"RewriteReplicationMatmul": 0.004129886627197266,
|
| 748 |
+
"RewriteWeights": 0.012514114379882813,
|
| 749 |
+
"SFKVectorizer": 0.3114356994628906,
|
| 750 |
+
"ShardingPropagationAnalysis": 0.03329586982727051,
|
| 751 |
+
"SimpleAllReduceTiling": 0.003468751907348633,
|
| 752 |
+
"Simplifier": 0.007978200912475586,
|
| 753 |
+
"SimplifyMacroPredicates": 0.01414942741394043,
|
| 754 |
+
"SimplifyNeuronTensor": 0.018707275390625,
|
| 755 |
+
"SimplifySlice": 0.0030634403228759766,
|
| 756 |
+
"SimplifyTensor": 0.028036117553710938,
|
| 757 |
+
"SpillPSum": 0.02836132049560547,
|
| 758 |
+
"SplitAPUnionSets": 0.028769254684448242,
|
| 759 |
+
"SplitAccGrp": 0.002518892288208008,
|
| 760 |
+
"StaticProfiler": 0.012613058090209961,
|
| 761 |
+
"StaticTransposeLocalTensor": 0.014979124069213867,
|
| 762 |
+
"SundaISel": 0.06619906425476074,
|
| 763 |
+
"TCTransform": 0.0018546581268310547,
|
| 764 |
+
"TensorInitialization": 0.0047528743743896484,
|
| 765 |
+
"TensorOpSimplifier": 0.006958484649658203,
|
| 766 |
+
"TensorOpTransform": 0.0394289493560791,
|
| 767 |
+
"TileCCOps": 0.03006148338317871,
|
| 768 |
+
"TilingProfiler": 0.020921945571899414,
|
| 769 |
+
"TransformConvOp": 0.0030717849731445313,
|
| 770 |
+
"TritiumFusion": 0.10711383819580078,
|
| 771 |
+
"ValueNumbering": 0.002644777297973633,
|
| 772 |
+
"VectorizeDMA": 0.009524345397949219,
|
| 773 |
+
"VectorizeMatMult": 0.04689669609069824,
|
| 774 |
+
"WeightCoalescing": 0.004178285598754883,
|
| 775 |
+
"ZeroSizeTensorElimination": 0.00014138221740722656
|
| 776 |
+
},
|
| 777 |
+
"tensorizer": {
|
| 778 |
+
"DMATilingProfiler::TotalInstructionsAfterTiling": 3307.0,
|
| 779 |
+
"StaticProfiler::AifUb": 142.25091552734375,
|
| 780 |
+
"StaticProfiler::ArithmeticIntensityTensorizer": 232.9062957763672,
|
| 781 |
+
"StaticProfiler::AverageDmaLength": 3958.823974609375,
|
| 782 |
+
"StaticProfiler::AverageFractalPeUtilization": 100.0,
|
| 783 |
+
"StaticProfiler::AveragePartitionUtilization": 99.65841674804688,
|
| 784 |
+
"StaticProfiler::AveragePeUtilization": 100.0,
|
| 785 |
+
"StaticProfiler::DDRTransferBytes": 118065160.0,
|
| 786 |
+
"StaticProfiler::InternalTransferBytes": 19660800.0,
|
| 787 |
+
"StaticProfiler::LoadExpanded": 17025.0,
|
| 788 |
+
"StaticProfiler::LocalizationEfficiency": 163.7292022705078,
|
| 789 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 185.10040283203125,
|
| 790 |
+
"StaticProfiler::StoreExpanded": 7937.0,
|
| 791 |
+
"StaticProfiler::TotalDMAExpanded": 24962.0,
|
| 792 |
+
"StaticProfiler::TotalDynamicInstancesCount": 3517.0,
|
| 793 |
+
"StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 3517.0,
|
| 794 |
+
"StaticProfiler::TotalLNCComm": 0.0,
|
| 795 |
+
"StaticProfiler::TotalLNCCommTransfer": 0.0,
|
| 796 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
|
| 797 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
|
| 798 |
+
"TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
|
| 799 |
+
"TilingProfiler::DmaInstructionsAfterTiling": 0.0,
|
| 800 |
+
"TilingProfiler::GenericInstructionsAfterTiling": 32.0,
|
| 801 |
+
"TilingProfiler::MatMultInstructionsAfterTiling": 2560.0,
|
| 802 |
+
"TilingProfiler::NumPfTransposes": 7.0,
|
| 803 |
+
"TilingProfiler::NumPfTransposesForIo": 3.0,
|
| 804 |
+
"TilingProfiler::NumPfTransposesForLocal": 2.0,
|
| 805 |
+
"TilingProfiler::NumPfTransposesForNonlocal": 2.0,
|
| 806 |
+
"TilingProfiler::PfTransposeInstructions": 232.0,
|
| 807 |
+
"TilingProfiler::PfTransposeInstructionsForIo": 72.0,
|
| 808 |
+
"TilingProfiler::PfTransposeInstructionsForLocal": 32.0,
|
| 809 |
+
"TilingProfiler::PfTransposeInstructionsForNonlocal": 128.0,
|
| 810 |
+
"TilingProfiler::ReduceInstructionsAfterTiling": 0.0,
|
| 811 |
+
"TilingProfiler::SimdInstructionsAfterTiling": 211.0,
|
| 812 |
+
"TilingProfiler::TotalInstructionsAfterTiling": 0.0,
|
| 813 |
+
"TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
|
| 814 |
+
"TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
|
| 815 |
+
"TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
|
| 816 |
+
"TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
|
| 817 |
+
"TransformConvOp::conv2d_column_packing": 0.0,
|
| 818 |
+
"TransformConvOp::conv2d_column_packing_1": 0.0,
|
| 819 |
+
"TransformConvOp::conv2d_column_packing_io10": 0.0,
|
| 820 |
+
"TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
|
| 821 |
+
}
|
| 822 |
+
},
|
| 823 |
+
"sg0002": {
|
| 824 |
+
"compiletime": {
|
| 825 |
+
"AGOrderingAnalysisPass": 0.07081985473632813,
|
| 826 |
+
"AffinePredicateResolution": 0.001847982406616211,
|
| 827 |
+
"AliasDependencyElimination": 0.0017039775848388672,
|
| 828 |
+
"AliasDependencyInduction": 0.016176223754882813,
|
| 829 |
+
"AliasDependencyReset": 0.0533907413482666,
|
| 830 |
+
"BFComputeCutting": 0.002690553665161133,
|
| 831 |
+
"BirCodeGenLoop": 0.436786413192749,
|
| 832 |
+
"CCOpFusion": 0.05509161949157715,
|
| 833 |
+
"CanonicalizeDAGForPGTiling": 0.01196432113647461,
|
| 834 |
+
"CanonicalizeIR": 0.002866029739379883,
|
| 835 |
+
"CoalesceCCOp": 0.00784611701965332,
|
| 836 |
+
"CommuteConcat": 0.0016961097717285156,
|
| 837 |
+
"DMALocalityOpt": 0.006368398666381836,
|
| 838 |
+
"DMAProfiler": 0.016033411026000977,
|
| 839 |
+
"DMATilingProfiler": 0.013326406478881836,
|
| 840 |
+
"DataLocalityOpt": 0.13399314880371094,
|
| 841 |
+
"DataStreaming": 0.005326271057128906,
|
| 842 |
+
"DeConcat": 0.003023386001586914,
|
| 843 |
+
"DeadCodeElimination": 0.006216287612915039,
|
| 844 |
+
"DeadStoreElimination": 0.01400136947631836,
|
| 845 |
+
"DelinearIndices": 0.014129638671875,
|
| 846 |
+
"Delinearization": 0.004580259323120117,
|
| 847 |
+
"DelinearizeSPMD": 0.02204442024230957,
|
| 848 |
+
"DoNothing": 6.771087646484375e-05,
|
| 849 |
+
"DramToDramTranspose": 0.0199737548828125,
|
| 850 |
+
"DumpGraphAndMetadata": 0.037271738052368164,
|
| 851 |
+
"EliminateDivs": 0.0025110244750976563,
|
| 852 |
+
"ExpandBatchNorm": 0.002251148223876953,
|
| 853 |
+
"ExpandISAMacro": 0.0057184696197509766,
|
| 854 |
+
"FactorizeBlkDims": 0.020665884017944336,
|
| 855 |
+
"FactorizeThreadAxesInFreeDims": 0.0031156539916992188,
|
| 856 |
+
"FlattenMacroLoop": 0.005499601364135742,
|
| 857 |
+
"GenericAccessSimplifier": 0.004717350006103516,
|
| 858 |
+
"InferInitValue": 0.046659231185913086,
|
| 859 |
+
"InferIntrinsicOnCC": 0.039793968200683594,
|
| 860 |
+
"InferNeuronTensor": 0.03774452209472656,
|
| 861 |
+
"InferNonlocalTensors": 0.030941486358642578,
|
| 862 |
+
"InferPSumTensor": 0.10350608825683594,
|
| 863 |
+
"InferShardAxis": 0.504509449005127,
|
| 864 |
+
"InferSharedMemLoc": 0.021315813064575195,
|
| 865 |
+
"InlineNativeKernels": 0.00193023681640625,
|
| 866 |
+
"InsertCoreBarrier": 0.008482217788696289,
|
| 867 |
+
"InsertIOTransposes": 0.061508893966674805,
|
| 868 |
+
"InsertImplicitShardAxisBeforeISel": 0.01612401008605957,
|
| 869 |
+
"InsertLocalTransposes": 0.005467414855957031,
|
| 870 |
+
"InsertOffloadedTransposes": 0.025030136108398438,
|
| 871 |
+
"LICM": 0.010097026824951172,
|
| 872 |
+
"LateLegalizeInst": 0.010406017303466797,
|
| 873 |
+
"LateLegalizePostSplit": 0.020189762115478516,
|
| 874 |
+
"LateLowerReshapeOp": 0.0018696784973144531,
|
| 875 |
+
"LateLowerTensorOp": 0.0022716522216796875,
|
| 876 |
+
"LateNeuronInstComb": 0.022235631942749023,
|
| 877 |
+
"LayoutPreprocessing": 0.05716848373413086,
|
| 878 |
+
"LayoutPreprocessingAndAnalysis": 0.12559008598327637,
|
| 879 |
+
"LayoutRequirementAnalysis": 0.01263284683227539,
|
| 880 |
+
"LegalizeCCOpLayout": 0.003709077835083008,
|
| 881 |
+
"LegalizeOpLevelAlias": 0.0016541481018066406,
|
| 882 |
+
"LegalizePartitionReduce": 0.007805347442626953,
|
| 883 |
+
"LegalizeSundaAccess": 0.09120893478393555,
|
| 884 |
+
"LegalizeSundaMacro": 0.020558595657348633,
|
| 885 |
+
"LegalizeType": 0.006526947021484375,
|
| 886 |
+
"LocalLayoutOpt": 0.04371356964111328,
|
| 887 |
+
"LoopFusion": 0.03305792808532715,
|
| 888 |
+
"LoopSplitting": 0.0017974376678466797,
|
| 889 |
+
"LowerBroadcast": 0.005987882614135742,
|
| 890 |
+
"LowerCCOpBlockAxis": 0.013673782348632813,
|
| 891 |
+
"LowerComplexBroadcast": 0.005238771438598633,
|
| 892 |
+
"LowerIntrinsics": 0.04390692710876465,
|
| 893 |
+
"LowerShardAxis": 0.02148151397705078,
|
| 894 |
+
"LowerTensorOp": 0.011847496032714844,
|
| 895 |
+
"LowerToSendRecv": 0.03099536895751953,
|
| 896 |
+
"LowerTranspose": 0.022028207778930664,
|
| 897 |
+
"MacroGeneration": 0.11886835098266602,
|
| 898 |
+
"MaskPropagation": 0.01356053352355957,
|
| 899 |
+
"MemcpyElimination": 0.050164222717285156,
|
| 900 |
+
"MutateDataType": 0.0028362274169921875,
|
| 901 |
+
"NeuronAliasDependencyInduction": 0.0024106502532958984,
|
| 902 |
+
"NeuronAliasDependencyReset": 0.07959818840026855,
|
| 903 |
+
"NeuronInstComb": 0.024571895599365234,
|
| 904 |
+
"NeuronLICM": 0.019634723663330078,
|
| 905 |
+
"NeuronLoopFusion": 0.0700373649597168,
|
| 906 |
+
"NeuronLoopInterchange": 0.003496885299682617,
|
| 907 |
+
"NeuronSimplifier": 0.0175168514251709,
|
| 908 |
+
"NeuronSimplifyPredicates": 0.01945638656616211,
|
| 909 |
+
"NeuronValueNumbering": 0.014354467391967773,
|
| 910 |
+
"OptimizeAliasedCopyChain": 0.0008881092071533203,
|
| 911 |
+
"OptimizeNKIKernels": 4.497897148132324,
|
| 912 |
+
"PAGLayoutOpt": 0.11170005798339844,
|
| 913 |
+
"PComputeCutting": 0.02699899673461914,
|
| 914 |
+
"PGLayoutTilingPipeline": 1.7730352878570557,
|
| 915 |
+
"PGTiling": 0.4928562641143799,
|
| 916 |
+
"PadElimination": 0.0005004405975341797,
|
| 917 |
+
"ParAxesAnnotation": 0.08141517639160156,
|
| 918 |
+
"PartialLoopFusion": 0.05184769630432129,
|
| 919 |
+
"PartialSimdFusion": 0.019034385681152344,
|
| 920 |
+
"PerfectLoopNest": 0.005218982696533203,
|
| 921 |
+
"RecognizeOpIdiom": 0.028120994567871094,
|
| 922 |
+
"Recompute": 0.0006320476531982422,
|
| 923 |
+
"RelaxPredicates": 0.012555122375488281,
|
| 924 |
+
"Rematerialization": 0.002846240997314453,
|
| 925 |
+
"RemoveShardedPartitionAxes": 0.028553009033203125,
|
| 926 |
+
"ReshapeWeights": 0.0013833045959472656,
|
| 927 |
+
"ResolveAccessConflict": 0.007452726364135742,
|
| 928 |
+
"ResolveComplicatePredicates": 0.002027273178100586,
|
| 929 |
+
"RewriteReplicationMatmul": 0.0019905567169189453,
|
| 930 |
+
"RewriteWeights": 0.005997419357299805,
|
| 931 |
+
"SFKVectorizer": 0.20844674110412598,
|
| 932 |
+
"ShardingPropagationAnalysis": 0.11750531196594238,
|
| 933 |
+
"SimpleAllReduceTiling": 0.0042400360107421875,
|
| 934 |
+
"Simplifier": 0.01620769500732422,
|
| 935 |
+
"SimplifyMacroPredicates": 0.03200030326843262,
|
| 936 |
+
"SimplifyNeuronTensor": 0.016496896743774414,
|
| 937 |
+
"SimplifySlice": 0.002093076705932617,
|
| 938 |
+
"SimplifyTensor": 0.01188349723815918,
|
| 939 |
+
"SpillPSum": 0.019929170608520508,
|
| 940 |
+
"SplitAPUnionSets": 0.09830927848815918,
|
| 941 |
+
"SplitAccGrp": 0.003184795379638672,
|
| 942 |
+
"StaticProfiler": 0.024499177932739258,
|
| 943 |
+
"StaticTransposeLocalTensor": 0.013921499252319336,
|
| 944 |
+
"SundaISel": 0.12911200523376465,
|
| 945 |
+
"TCTransform": 0.01076197624206543,
|
| 946 |
+
"TensorInitialization": 0.015585660934448242,
|
| 947 |
+
"TensorOpSimplifier": 0.009182214736938477,
|
| 948 |
+
"TensorOpTransform": 0.02479076385498047,
|
| 949 |
+
"TileCCOps": 0.01529073715209961,
|
| 950 |
+
"TilingProfiler": 0.02448558807373047,
|
| 951 |
+
"TransformConvOp": 0.0032668113708496094,
|
| 952 |
+
"TritiumFusion": 0.07947993278503418,
|
| 953 |
+
"ValueNumbering": 0.008611917495727539,
|
| 954 |
+
"VectorizeDMA": 0.008882284164428711,
|
| 955 |
+
"VectorizeMatMult": 0.013601303100585938,
|
| 956 |
+
"WeightCoalescing": 0.0029730796813964844,
|
| 957 |
+
"ZeroSizeTensorElimination": 0.00017452239990234375
|
| 958 |
+
},
|
| 959 |
+
"tensorizer": {
|
| 960 |
+
"DMATilingProfiler::TotalInstructionsAfterTiling": 22051.0,
|
| 961 |
+
"StaticProfiler::AifUb": 173.52798461914063,
|
| 962 |
+
"StaticProfiler::ArithmeticIntensityTensorizer": 150.2424774169922,
|
| 963 |
+
"StaticProfiler::AverageDmaLength": 2589.193359375,
|
| 964 |
+
"StaticProfiler::AverageFractalPeUtilization": 98.77135467529297,
|
| 965 |
+
"StaticProfiler::AveragePartitionUtilization": 94.32398223876953,
|
| 966 |
+
"StaticProfiler::AveragePeUtilization": 96.75625610351563,
|
| 967 |
+
"StaticProfiler::DDRTransferBytes": 407886880.0,
|
| 968 |
+
"StaticProfiler::InternalTransferBytes": 327079712.0,
|
| 969 |
+
"StaticProfiler::LoadExpanded": 89436.0,
|
| 970 |
+
"StaticProfiler::LocalizationEfficiency": 86.58112335205078,
|
| 971 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 89.48306274414063,
|
| 972 |
+
"StaticProfiler::StoreExpanded": 2154.0,
|
| 973 |
+
"StaticProfiler::TotalDMAExpanded": 91590.0,
|
| 974 |
+
"StaticProfiler::TotalDynamicInstancesCount": 26447.0,
|
| 975 |
+
"StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 25996.0,
|
| 976 |
+
"StaticProfiler::TotalLNCComm": 0.0,
|
| 977 |
+
"StaticProfiler::TotalLNCCommTransfer": 0.0,
|
| 978 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
|
| 979 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
|
| 980 |
+
"TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
|
| 981 |
+
"TilingProfiler::DmaInstructionsAfterTiling": 0.0,
|
| 982 |
+
"TilingProfiler::GenericInstructionsAfterTiling": 4.0,
|
| 983 |
+
"TilingProfiler::MatMultInstructionsAfterTiling": 11424.0,
|
| 984 |
+
"TilingProfiler::NumPfTransposes": 6.0,
|
| 985 |
+
"TilingProfiler::NumPfTransposesForIo": 1.0,
|
| 986 |
+
"TilingProfiler::NumPfTransposesForLocal": 1.0,
|
| 987 |
+
"TilingProfiler::NumPfTransposesForNonlocal": 4.0,
|
| 988 |
+
"TilingProfiler::PfTransposeInstructions": 10291.0,
|
| 989 |
+
"TilingProfiler::PfTransposeInstructionsForIo": 9504.0,
|
| 990 |
+
"TilingProfiler::PfTransposeInstructionsForLocal": 1.0,
|
| 991 |
+
"TilingProfiler::PfTransposeInstructionsForNonlocal": 786.0,
|
| 992 |
+
"TilingProfiler::ReduceInstructionsAfterTiling": 4.0,
|
| 993 |
+
"TilingProfiler::SimdInstructionsAfterTiling": 164.0,
|
| 994 |
+
"TilingProfiler::TotalInstructionsAfterTiling": 0.0,
|
| 995 |
+
"TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
|
| 996 |
+
"TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
|
| 997 |
+
"TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
|
| 998 |
+
"TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
|
| 999 |
+
"TransformConvOp::conv2d_column_packing": 0.0,
|
| 1000 |
+
"TransformConvOp::conv2d_column_packing_1": 0.0,
|
| 1001 |
+
"TransformConvOp::conv2d_column_packing_io10": 0.0,
|
| 1002 |
+
"TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
|
| 1003 |
+
}
|
| 1004 |
+
},
|
| 1005 |
+
"sg01": {
|
| 1006 |
+
"compiletime": {
|
| 1007 |
+
"CanonicalizeConv": 9.999999974752427e-07,
|
| 1008 |
+
"CanonicalizeForTensorizer": 1.4000000192027073e-05,
|
| 1009 |
+
"Canonicalizer": 0.00020799999765586108,
|
| 1010 |
+
"HoistCompute": 4.999999873689376e-06,
|
| 1011 |
+
"IdentifyCrossPassTensors": 1.5999999959603883e-05,
|
| 1012 |
+
"MemcastMotion": 7.000000096013537e-06,
|
| 1013 |
+
"PenguinizeFunctions": 1.2000000424450263e-05,
|
| 1014 |
+
"PruneFunctions": 1.5999999959603883e-05,
|
| 1015 |
+
"RemoveOptimizationBarriers": 2.2000000171829015e-05,
|
| 1016 |
+
"ScatterMotion": 1.1000000085914508e-05,
|
| 1017 |
+
"TensorizerLegalizationPass": 1.700000029813964e-05,
|
| 1018 |
+
"VerifySupportedOps": 9.999999747378752e-06,
|
| 1019 |
+
"algsimp": 4.5000000682193786e-05,
|
| 1020 |
+
"batchnorm_expander": 1.2000000424450263e-05,
|
| 1021 |
+
"boundary-marker-removal": 3.999999989900971e-06,
|
| 1022 |
+
"call-inliner": 7.999999979801942e-06,
|
| 1023 |
+
"canonicalize-boundary-marker": 4.999999873689376e-06,
|
| 1024 |
+
"collective-stream-id-checker": 3.000000106112566e-06,
|
| 1025 |
+
"comparison-expander": 4.999999873689376e-06,
|
| 1026 |
+
"computation-deduplicator": 1.8999999156221747e-05,
|
| 1027 |
+
"config-lowering": 2.8000000384054147e-05,
|
| 1028 |
+
"constant_folding": 7.000000096013537e-06,
|
| 1029 |
+
"cse": 9.999999747378752e-06,
|
| 1030 |
+
"dce": 9.999999974752427e-07,
|
| 1031 |
+
"dynamic-slice-transpose": 3.999999989900971e-06,
|
| 1032 |
+
"eliminate-redundant-compare": 3.000000106112566e-06,
|
| 1033 |
+
"emit-offloaded-dropout": 1.1000000085914508e-05,
|
| 1034 |
+
"flatten-call-graph": 6.000000212225132e-06,
|
| 1035 |
+
"fuse-send-recv": 1.8999999156221747e-05,
|
| 1036 |
+
"hilo-conditional-to-select": 3.999999989900971e-06,
|
| 1037 |
+
"hilo::LegalizeAlias": 4.999999873689376e-06,
|
| 1038 |
+
"hilo::NeuronInstCombine": 5.199999941396527e-05,
|
| 1039 |
+
"hilo::NeuronOpFusion": 1.1000000085914508e-05,
|
| 1040 |
+
"hilo::ReplaceTokenTypeWithU8Pass": 6.000000212225132e-06,
|
| 1041 |
+
"hilo::ScheduleFusion": 9.999999974752427e-07,
|
| 1042 |
+
"hilo::SixtyFourHack": 1.2000000424450263e-05,
|
| 1043 |
+
"hilo::VerifyAliasing": 1.9999999949504854e-06,
|
| 1044 |
+
"hlo-mac-count": 9.699999645818025e-05,
|
| 1045 |
+
"legalize-ccops-for-tensorizer": 9.999999974752427e-07,
|
| 1046 |
+
"legalize-compare": 3.999999989900971e-06,
|
| 1047 |
+
"lower-argminmax-custom-call": 3.000000106112566e-06,
|
| 1048 |
+
"map-inline": 9.999999747378752e-06,
|
| 1049 |
+
"metadata-naming": 1.8000000636675395e-05,
|
| 1050 |
+
"mlir::detail::OpToOpPassAdaptor": 1.9999999494757503e-05,
|
| 1051 |
+
"mlir::hlo::MhloToPyPenguin": 0.001829999964684248,
|
| 1052 |
+
"mlir::mhlo::LowerComplexExtraPass": 0.00011999999696854502,
|
| 1053 |
+
"mlir::mhlo::LowerComplexPass": 0.0001849999971454963,
|
| 1054 |
+
"native-to-custom-softmax": 4.999999873689376e-06,
|
| 1055 |
+
"native-to-custom-softmax-dx": 1.2000000424450263e-05,
|
| 1056 |
+
"neuron-hlo-verifier": 0.00036700000055134296,
|
| 1057 |
+
"operand_upcaster": 1.4000000192027073e-05,
|
| 1058 |
+
"post-par-pipe-begin": 9.999999974752427e-07,
|
| 1059 |
+
"post-par-pipe-end": 0.0,
|
| 1060 |
+
"post-partition-simplification": 0.0004250000056345016,
|
| 1061 |
+
"replace-minimum-constant": 4.999999873689376e-06,
|
| 1062 |
+
"reshape-mover": 1.9999999949504854e-06,
|
| 1063 |
+
"simplify-concat": 3.5000000934815034e-05,
|
| 1064 |
+
"simplify-while-loops": 1.9999999949504854e-06,
|
| 1065 |
+
"transform-variadic-reduce": 7.000000096013537e-06,
|
| 1066 |
+
"tuple-simplifier": 3.999999989900971e-06,
|
| 1067 |
+
"unpack-nested-aws-ntwsr": 3.000000106112566e-06,
|
| 1068 |
+
"unroll-while-loop": 0.0
|
| 1069 |
+
},
|
| 1070 |
+
"hilo": {
|
| 1071 |
+
"ArithmeticIntensity": 201.87655639648438,
|
| 1072 |
+
"HloMacCount": 13153337344.0,
|
| 1073 |
+
"Traffic": 130310688.0
|
| 1074 |
+
}
|
| 1075 |
+
},
|
| 1076 |
+
"sg02": {
|
| 1077 |
+
"compiletime": {
|
| 1078 |
+
"CanonicalizeConv": 9.000000318337698e-06,
|
| 1079 |
+
"CanonicalizeForTensorizer": 1.2000000424450263e-05,
|
| 1080 |
+
"Canonicalizer": 0.0002739999908953905,
|
| 1081 |
+
"HoistCompute": 0.0,
|
| 1082 |
+
"IdentifyCrossPassTensors": 1.4999999621068127e-05,
|
| 1083 |
+
"MemcastMotion": 9.999999974752427e-07,
|
| 1084 |
+
"PenguinizeFunctions": 1.1000000085914508e-05,
|
| 1085 |
+
"PruneFunctions": 7.999999979801942e-06,
|
| 1086 |
+
"RemoveOptimizationBarriers": 3.899999865097925e-05,
|
| 1087 |
+
"ScatterMotion": 3.000000106112566e-06,
|
| 1088 |
+
"TensorizerLegalizationPass": 6.000000212225132e-06,
|
| 1089 |
+
"VerifySupportedOps": 1.2000000424450263e-05,
|
| 1090 |
+
"algsimp": 4.8000001697801054e-05,
|
| 1091 |
+
"batchnorm_expander": 1.1000000085914508e-05,
|
| 1092 |
+
"boundary-marker-removal": 3.000000106112566e-06,
|
| 1093 |
+
"call-inliner": 9.999999747378752e-06,
|
| 1094 |
+
"canonicalize-boundary-marker": 3.999999989900971e-06,
|
| 1095 |
+
"collective-stream-id-checker": 3.000000106112566e-06,
|
| 1096 |
+
"comparison-expander": 4.999999873689376e-06,
|
| 1097 |
+
"computation-deduplicator": 1.8999999156221747e-05,
|
| 1098 |
+
"config-lowering": 3.600000127335079e-05,
|
| 1099 |
+
"constant_folding": 7.000000096013537e-06,
|
| 1100 |
+
"cse": 1.2000000424450263e-05,
|
| 1101 |
+
"dce": 9.999999974752427e-07,
|
| 1102 |
+
"dynamic-slice-transpose": 3.999999989900971e-06,
|
| 1103 |
+
"eliminate-redundant-compare": 3.000000106112566e-06,
|
| 1104 |
+
"emit-offloaded-dropout": 1.1000000085914508e-05,
|
| 1105 |
+
"flatten-call-graph": 9.999999747378752e-06,
|
| 1106 |
+
"fuse-send-recv": 1.700000029813964e-05,
|
| 1107 |
+
"hilo-conditional-to-select": 4.999999873689376e-06,
|
| 1108 |
+
"hilo::LegalizeAlias": 1.9999999949504854e-06,
|
| 1109 |
+
"hilo::NeuronInstCombine": 4.400000034365803e-05,
|
| 1110 |
+
"hilo::NeuronOpFusion": 4.999999873689376e-06,
|
| 1111 |
+
"hilo::ReplaceTokenTypeWithU8Pass": 1.2999999853491317e-05,
|
| 1112 |
+
"hilo::ScheduleFusion": 3.999999989900971e-06,
|
| 1113 |
+
"hilo::SixtyFourHack": 4.099999932805076e-05,
|
| 1114 |
+
"hilo::VerifyAliasing": 9.999999974752427e-07,
|
| 1115 |
+
"hlo-mac-count": 0.004902000073343515,
|
| 1116 |
+
"legalize-ccops-for-tensorizer": 9.999999974752427e-07,
|
| 1117 |
+
"legalize-compare": 3.999999989900971e-06,
|
| 1118 |
+
"lower-argminmax-custom-call": 3.000000106112566e-06,
|
| 1119 |
+
"map-inline": 9.999999747378752e-06,
|
| 1120 |
+
"metadata-naming": 1.4000000192027073e-05,
|
| 1121 |
+
"mlir::detail::OpToOpPassAdaptor": 2.499999936844688e-05,
|
| 1122 |
+
"mlir::hlo::MhloToPyPenguin": 0.005096000153571367,
|
| 1123 |
+
"mlir::mhlo::LowerComplexExtraPass": 9.40000027185306e-05,
|
| 1124 |
+
"mlir::mhlo::LowerComplexPass": 0.00016599999798927456,
|
| 1125 |
+
"native-to-custom-softmax": 6.000000212225132e-06,
|
| 1126 |
+
"native-to-custom-softmax-dx": 2.4000000848900527e-05,
|
| 1127 |
+
"neuron-hlo-verifier": 0.00033099998836405575,
|
| 1128 |
+
"operand_upcaster": 1.5999999959603883e-05,
|
| 1129 |
+
"post-par-pipe-begin": 3.999999989900971e-06,
|
| 1130 |
+
"post-par-pipe-end": 0.0,
|
| 1131 |
+
"post-partition-simplification": 0.00046400001156143844,
|
| 1132 |
+
"replace-minimum-constant": 7.000000096013537e-06,
|
| 1133 |
+
"reshape-mover": 1.9999999949504854e-06,
|
| 1134 |
+
"simplify-concat": 3.199999991920777e-05,
|
| 1135 |
+
"simplify-while-loops": 1.9999999949504854e-06,
|
| 1136 |
+
"transform-variadic-reduce": 4.5000000682193786e-05,
|
| 1137 |
+
"tuple-simplifier": 3.999999989900971e-06,
|
| 1138 |
+
"unpack-nested-aws-ntwsr": 3.999999989900971e-06,
|
| 1139 |
+
"unroll-while-loop": 0.0
|
| 1140 |
+
},
|
| 1141 |
+
"hilo": {
|
| 1142 |
+
"ArithmeticIntensity": 55.24231719970703,
|
| 1143 |
+
"HloMacCount": 9820307456.0,
|
| 1144 |
+
"Traffic": 355535680.0
|
| 1145 |
+
}
|
| 1146 |
+
},
|
| 1147 |
+
"topk": {
|
| 1148 |
+
"compiletime": {
|
| 1149 |
+
"CoalesceCCOp": 0.012721538543701172,
|
| 1150 |
+
"DMALocalityOpt": 0.00609898567199707,
|
| 1151 |
+
"DMAProfiler": 0.007831335067749023,
|
| 1152 |
+
"DataStreaming": 0.01673150062561035,
|
| 1153 |
+
"DoNothing": 0.0002722740173339844,
|
| 1154 |
+
"ExpandISAMacro": 0.0056455135345458984,
|
| 1155 |
+
"FactorizeBlkDims": 0.0197756290435791,
|
| 1156 |
+
"InferPSumTensor": 0.023047685623168945,
|
| 1157 |
+
"InferSharedMemLoc": 0.011858940124511719,
|
| 1158 |
+
"InsertCoreBarrier": 0.011088848114013672,
|
| 1159 |
+
"LateLegalizeInst": 0.02294301986694336,
|
| 1160 |
+
"LateNeuronInstComb": 0.03573012351989746,
|
| 1161 |
+
"LegalizeSundaAccess": 0.04056549072265625,
|
| 1162 |
+
"LegalizeType": 0.036716461181640625,
|
| 1163 |
+
"LowerBroadcast": 0.009067773818969727,
|
| 1164 |
+
"LowerIntrinsics": 0.0156552791595459,
|
| 1165 |
+
"LowerTranspose": 0.004080295562744141,
|
| 1166 |
+
"NeuronInstComb": 0.030441999435424805,
|
| 1167 |
+
"NeuronLICM": 0.03961777687072754,
|
| 1168 |
+
"NeuronSimplifyPredicates": 0.012285470962524414,
|
| 1169 |
+
"NeuronValueNumbering": 0.007288455963134766,
|
| 1170 |
+
"SFKVectorizer": 0.06282949447631836,
|
| 1171 |
+
"SimpleAllReduceTiling": 0.016891002655029297,
|
| 1172 |
+
"SimplifyNeuronTensor": 0.08206772804260254,
|
| 1173 |
+
"SpillPSum": 0.045392751693725586,
|
| 1174 |
+
"WeightCoalescing": 0.0072481632232666016
|
| 1175 |
+
}
|
| 1176 |
+
}
|
| 1177 |
+
}
|
context_encoding_model/_tp0_bk2/graph.neff
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:06ba2911f0e007b1f4ad7d888115d6589d3bf2b988bbc6b3bc84a1db0766bb48
|
| 3 |
+
size 1342464
|
context_encoding_model/_tp0_bk2/log-neuron-cc.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
context_encoding_model/_tp0_bk2/metaneff.pb
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e16c9f7e6763d8d2b02577a4b90bcb120069c7fe5bb1001520c159d08abf614c
|
| 3 |
+
size 2610412
|
context_encoding_model/_tp0_bk2/model.MODULE_49bb42f69f5b159ae769+3467f95e.hlo_module.pb
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c22ff4f27dafd3772342a93352c9b5a2c076d1824cec83419ac3d1f8c07d4e2f
|
| 3 |
+
size 2697198
|
context_encoding_model/_tp0_bk2/model.MODULE_49bb42f69f5b159ae769+3467f95e.neff
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:06ba2911f0e007b1f4ad7d888115d6589d3bf2b988bbc6b3bc84a1db0766bb48
|
| 3 |
+
size 1342464
|
context_encoding_model/_tp0_bk2/neuron_config.json
ADDED
|
@@ -0,0 +1,224 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_attn_implementation_autoset": false,
|
| 3 |
+
"_name_or_path": "/home/ubuntu/models/Qwen3-1.7B",
|
| 4 |
+
"add_cross_attention": false,
|
| 5 |
+
"architectures": [
|
| 6 |
+
"Qwen3ForCausalLM"
|
| 7 |
+
],
|
| 8 |
+
"attention_bias": false,
|
| 9 |
+
"attention_dropout": 0.0,
|
| 10 |
+
"attribute_map": {},
|
| 11 |
+
"bad_words_ids": null,
|
| 12 |
+
"begin_suppress_tokens": null,
|
| 13 |
+
"bos_token_id": 151643,
|
| 14 |
+
"chunk_size_feed_forward": 0,
|
| 15 |
+
"cross_attention_hidden_size": null,
|
| 16 |
+
"decoder_start_token_id": null,
|
| 17 |
+
"diversity_penalty": 0.0,
|
| 18 |
+
"do_sample": false,
|
| 19 |
+
"early_stopping": false,
|
| 20 |
+
"encoder_no_repeat_ngram_size": 0,
|
| 21 |
+
"eos_token_id": 151645,
|
| 22 |
+
"exponential_decay_length_penalty": null,
|
| 23 |
+
"finetuning_task": null,
|
| 24 |
+
"forced_bos_token_id": null,
|
| 25 |
+
"forced_eos_token_id": null,
|
| 26 |
+
"fused_spec_config": null,
|
| 27 |
+
"head_dim": 128,
|
| 28 |
+
"hidden_act": "silu",
|
| 29 |
+
"hidden_size": 2048,
|
| 30 |
+
"id2label": {
|
| 31 |
+
"0": "LABEL_0",
|
| 32 |
+
"1": "LABEL_1"
|
| 33 |
+
},
|
| 34 |
+
"initializer_range": 0.02,
|
| 35 |
+
"intermediate_size": 6144,
|
| 36 |
+
"is_decoder": false,
|
| 37 |
+
"is_encoder_decoder": false,
|
| 38 |
+
"label2id": {
|
| 39 |
+
"LABEL_0": 0,
|
| 40 |
+
"LABEL_1": 1
|
| 41 |
+
},
|
| 42 |
+
"length_penalty": 1.0,
|
| 43 |
+
"max_length": 20,
|
| 44 |
+
"max_position_embeddings": 40960,
|
| 45 |
+
"max_window_layers": 28,
|
| 46 |
+
"metadata": null,
|
| 47 |
+
"min_length": 0,
|
| 48 |
+
"model_type": "qwen3",
|
| 49 |
+
"neuron_config": {
|
| 50 |
+
"activation_quantization_type": null,
|
| 51 |
+
"allow_input_truncation": false,
|
| 52 |
+
"apply_seq_ids_mask": false,
|
| 53 |
+
"async_mode": false,
|
| 54 |
+
"attention_dp_degree": 1,
|
| 55 |
+
"attention_dtype": null,
|
| 56 |
+
"attn_block_cte_nki_kernel_enabled": false,
|
| 57 |
+
"attn_block_tkg_nki_kernel_cache_update": false,
|
| 58 |
+
"attn_block_tkg_nki_kernel_cascaded_attention": false,
|
| 59 |
+
"attn_block_tkg_nki_kernel_enabled": false,
|
| 60 |
+
"attn_cls": {
|
| 61 |
+
"__module__": "neuronx_distributed_inference.models.qwen3.modeling_qwen3",
|
| 62 |
+
"__name__": "NeuronQwen3Attention"
|
| 63 |
+
},
|
| 64 |
+
"attn_kernel_enabled": null,
|
| 65 |
+
"attn_tkg_builtin_kernel_enabled": false,
|
| 66 |
+
"attn_tkg_nki_kernel_enabled": false,
|
| 67 |
+
"batch_size": 1,
|
| 68 |
+
"bucket_n_active_tokens": true,
|
| 69 |
+
"buckets": [
|
| 70 |
+
512
|
| 71 |
+
],
|
| 72 |
+
"cast_type": "config",
|
| 73 |
+
"cc_pipeline_tiling_factor": 2,
|
| 74 |
+
"chunked_prefill_config": null,
|
| 75 |
+
"context_encoding_buckets": [
|
| 76 |
+
512
|
| 77 |
+
],
|
| 78 |
+
"cp_degree": 1,
|
| 79 |
+
"ctx_batch_size": 1,
|
| 80 |
+
"disable_kv_cache_tiling": false,
|
| 81 |
+
"draft_model_modules_to_not_convert": null,
|
| 82 |
+
"enable_bucketing": true,
|
| 83 |
+
"enable_cte_modular_flow": false,
|
| 84 |
+
"enable_eagle_draft_input_norm": false,
|
| 85 |
+
"enable_eagle_speculation": false,
|
| 86 |
+
"enable_fused_speculation": false,
|
| 87 |
+
"enable_long_context_mode": false,
|
| 88 |
+
"enable_output_completion_notifications": false,
|
| 89 |
+
"enable_spill_reload_dge": false,
|
| 90 |
+
"enable_token_tree": false,
|
| 91 |
+
"ep_degree": 1,
|
| 92 |
+
"expert_mlp_nki_kernel_enabled": null,
|
| 93 |
+
"flash_decoding_enabled": false,
|
| 94 |
+
"fused_qkv": false,
|
| 95 |
+
"fused_rmsnorm_skip_gamma": false,
|
| 96 |
+
"is_block_kv_layout": null,
|
| 97 |
+
"is_chunked_prefill": false,
|
| 98 |
+
"is_continuous_batching": true,
|
| 99 |
+
"is_eagle_draft": false,
|
| 100 |
+
"is_medusa": false,
|
| 101 |
+
"is_prefill_stage": true,
|
| 102 |
+
"is_prefix_caching": false,
|
| 103 |
+
"k_cache_transposed": false,
|
| 104 |
+
"kv_cache_batch_size": 8,
|
| 105 |
+
"kv_cache_padding_size": 0,
|
| 106 |
+
"kv_cache_quant": false,
|
| 107 |
+
"kv_cache_tiling": false,
|
| 108 |
+
"layer_boundary_markers": false,
|
| 109 |
+
"lm_head_pad": true,
|
| 110 |
+
"lm_head_pad_alignment_size": 1,
|
| 111 |
+
"local_ranks_size": 2,
|
| 112 |
+
"logical_nc_config": 2,
|
| 113 |
+
"lora_config": null,
|
| 114 |
+
"max_batch_size": 8,
|
| 115 |
+
"max_context_length": 4096,
|
| 116 |
+
"max_length": 4096,
|
| 117 |
+
"max_new_tokens": null,
|
| 118 |
+
"medusa_speculation_length": 0,
|
| 119 |
+
"medusa_tree": null,
|
| 120 |
+
"mlp_kernel_enabled": false,
|
| 121 |
+
"mlp_kernel_fuse_residual_add": false,
|
| 122 |
+
"modules_to_not_convert": null,
|
| 123 |
+
"moe_fused_nki_kernel_enabled": null,
|
| 124 |
+
"n_active_tokens": 4096,
|
| 125 |
+
"n_positions": 4096,
|
| 126 |
+
"num_medusa_heads": 0,
|
| 127 |
+
"on_cpu": false,
|
| 128 |
+
"on_device_sampling_config": {
|
| 129 |
+
"deterministic": false,
|
| 130 |
+
"do_sample": false,
|
| 131 |
+
"dynamic": true,
|
| 132 |
+
"global_topk": 256,
|
| 133 |
+
"on_device_sampling_config": true,
|
| 134 |
+
"temperature": 1.0,
|
| 135 |
+
"top_k": 1,
|
| 136 |
+
"top_k_kernel_enabled": false,
|
| 137 |
+
"top_p": 1.0
|
| 138 |
+
},
|
| 139 |
+
"output_logits": false,
|
| 140 |
+
"overrides_torch_dtype": true,
|
| 141 |
+
"pa_block_size": 4096,
|
| 142 |
+
"pa_num_blocks": 8,
|
| 143 |
+
"padding_side": "right",
|
| 144 |
+
"pp_degree": 1,
|
| 145 |
+
"prefix_buckets": null,
|
| 146 |
+
"qk_layernorm": false,
|
| 147 |
+
"qkv_kernel_enabled": false,
|
| 148 |
+
"qkv_kernel_fuse_residual_add": false,
|
| 149 |
+
"qkv_kernel_nbsd_layout": false,
|
| 150 |
+
"quantization_dtype": "int8",
|
| 151 |
+
"quantization_type": "per_tensor_symmetric",
|
| 152 |
+
"quantize_clamp_bound": Infinity,
|
| 153 |
+
"quantized": false,
|
| 154 |
+
"quantized_checkpoints_path": null,
|
| 155 |
+
"quantized_mlp_kernel_enabled": false,
|
| 156 |
+
"rmsnorm_quantize_kernel_enabled": false,
|
| 157 |
+
"router_topk_nki_kernel_enabled": null,
|
| 158 |
+
"rpl_reduce_dtype": null,
|
| 159 |
+
"save_sharded_checkpoint": true,
|
| 160 |
+
"scratchpad_page_size": null,
|
| 161 |
+
"seq_len": 4096,
|
| 162 |
+
"seq_len_threshold_for_cc_tiling": 16384,
|
| 163 |
+
"sequence_parallel_enabled": false,
|
| 164 |
+
"shared_mlp_nki_kernel_enabled": null,
|
| 165 |
+
"skip_sharding": false,
|
| 166 |
+
"skip_warmup": false,
|
| 167 |
+
"spec_batch_size": 8,
|
| 168 |
+
"speculation_length": 0,
|
| 169 |
+
"start_rank_id": 0,
|
| 170 |
+
"strided_context_parallel_kernel_enabled": false,
|
| 171 |
+
"target": null,
|
| 172 |
+
"tensor_capture_config": null,
|
| 173 |
+
"tile_cc": false,
|
| 174 |
+
"tkg_batch_size": 8,
|
| 175 |
+
"token_generation_buckets": null,
|
| 176 |
+
"token_tree_config": null,
|
| 177 |
+
"torch_dtype": "bfloat16",
|
| 178 |
+
"tp_degree": 2,
|
| 179 |
+
"vocab_parallel": false,
|
| 180 |
+
"weight_gather_seq_len_threshold": 32768,
|
| 181 |
+
"weights_to_skip_layout_optimization": [],
|
| 182 |
+
"world_size": 2
|
| 183 |
+
},
|
| 184 |
+
"no_repeat_ngram_size": 0,
|
| 185 |
+
"num_attention_heads": 16,
|
| 186 |
+
"num_beam_groups": 1,
|
| 187 |
+
"num_beams": 1,
|
| 188 |
+
"num_cores_per_group": 1,
|
| 189 |
+
"num_hidden_layers": 28,
|
| 190 |
+
"num_key_value_heads": 8,
|
| 191 |
+
"num_return_sequences": 1,
|
| 192 |
+
"output_attentions": false,
|
| 193 |
+
"output_hidden_states": false,
|
| 194 |
+
"output_scores": false,
|
| 195 |
+
"pad_token_id": 0,
|
| 196 |
+
"prefix": null,
|
| 197 |
+
"problem_type": null,
|
| 198 |
+
"pruned_heads": {},
|
| 199 |
+
"remove_invalid_values": false,
|
| 200 |
+
"repetition_penalty": 1.0,
|
| 201 |
+
"return_dict": true,
|
| 202 |
+
"return_dict_in_generate": false,
|
| 203 |
+
"rms_norm_eps": 1e-06,
|
| 204 |
+
"rope_scaling": null,
|
| 205 |
+
"rope_theta": 1000000,
|
| 206 |
+
"sep_token_id": null,
|
| 207 |
+
"sliding_window": null,
|
| 208 |
+
"suppress_tokens": null,
|
| 209 |
+
"task_specific_params": null,
|
| 210 |
+
"temperature": 1.0,
|
| 211 |
+
"tf_legacy_loss": false,
|
| 212 |
+
"tie_encoder_decoder": false,
|
| 213 |
+
"tie_word_embeddings": true,
|
| 214 |
+
"tokenizer_class": null,
|
| 215 |
+
"top_k": 50,
|
| 216 |
+
"top_p": 1.0,
|
| 217 |
+
"torchscript": false,
|
| 218 |
+
"transformers_version": "4.51.0",
|
| 219 |
+
"typical_p": 1.0,
|
| 220 |
+
"use_bfloat16": false,
|
| 221 |
+
"use_cache": true,
|
| 222 |
+
"use_sliding_window": false,
|
| 223 |
+
"vocab_size": 151936
|
| 224 |
+
}
|
context_encoding_model/_tp0_bk3/command.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
neuronx-cc compile --framework=XLA model.MODULE_be035899334776123ed5+d208bdce.hlo_module.pb --output model.MODULE_be035899334776123ed5+d208bdce.neff --target=trn2 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ' --lnc=2 -O1 '--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true' --logfile=log-neuron-cc.txt --verbose=35
|
context_encoding_model/_tp0_bk3/compile_flags.MODULE_be035899334776123ed5+d208bdce.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
["--target=trn2", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "--lnc=2", "-O1", "--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true", "--logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk3/log-neuron-cc.txt"]
|
context_encoding_model/_tp0_bk3/global_metric_store.json
ADDED
|
@@ -0,0 +1,1177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"Average": {
|
| 3 |
+
"tensorizer": {
|
| 4 |
+
"StaticProfiler::AverageFractalPeUtilization": 98.80319213867188,
|
| 5 |
+
"StaticProfiler::AveragePartitionUtilization": 94.51075744628906,
|
| 6 |
+
"StaticProfiler::AveragePeUtilization": 96.83863067626953,
|
| 7 |
+
"StaticProfiler::LocalizationEfficiency": 84.98564147949219,
|
| 8 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 89.59233093261719,
|
| 9 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
|
| 10 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 0.0
|
| 11 |
+
}
|
| 12 |
+
},
|
| 13 |
+
"Count": {
|
| 14 |
+
"tensorizer": {
|
| 15 |
+
"StaticProfiler::AverageFractalPeUtilization": 1.0,
|
| 16 |
+
"StaticProfiler::AveragePartitionUtilization": 1.0,
|
| 17 |
+
"StaticProfiler::AveragePeUtilization": 1.0,
|
| 18 |
+
"StaticProfiler::LocalizationEfficiency": 1.0,
|
| 19 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1.0,
|
| 20 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 1.0,
|
| 21 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 1.0
|
| 22 |
+
}
|
| 23 |
+
},
|
| 24 |
+
"Sum": {
|
| 25 |
+
"compiletime": {
|
| 26 |
+
"AGOrderingAnalysisPass": 0.057534217834472656,
|
| 27 |
+
"AffinePredicateResolution": 0.0009605884552001953,
|
| 28 |
+
"AliasDependencyElimination": 0.00025153160095214844,
|
| 29 |
+
"AliasDependencyInduction": 0.006276607513427734,
|
| 30 |
+
"AliasDependencyReset": 0.027743816375732422,
|
| 31 |
+
"BFComputeCutting": 0.0031321048736572266,
|
| 32 |
+
"BirCodeGenLoop": 0.5169932842254639,
|
| 33 |
+
"CCOpFusion": 0.05496716499328613,
|
| 34 |
+
"CanonicalizeConv": 1.8000000636675395e-05,
|
| 35 |
+
"CanonicalizeDAGForPGTiling": 0.010706663131713867,
|
| 36 |
+
"CanonicalizeForTensorizer": 3.7000001611886546e-05,
|
| 37 |
+
"CanonicalizeIR": 0.00154876708984375,
|
| 38 |
+
"Canonicalizer": 0.0007949999999254942,
|
| 39 |
+
"CoalesceCCOp": 0.0278623104095459,
|
| 40 |
+
"CommuteConcat": 0.001708984375,
|
| 41 |
+
"DMALocalityOpt": 0.010039329528808594,
|
| 42 |
+
"DMAProfiler": 0.031324148178100586,
|
| 43 |
+
"DMATilingProfiler": 0.011522531509399414,
|
| 44 |
+
"DataLocalityOpt": 0.28015780448913574,
|
| 45 |
+
"DataStreaming": 0.031224727630615234,
|
| 46 |
+
"DeConcat": 0.002462148666381836,
|
| 47 |
+
"DeadCodeElimination": 0.0021996498107910156,
|
| 48 |
+
"DeadStoreElimination": 0.007483243942260742,
|
| 49 |
+
"DelinearIndices": 0.008810281753540039,
|
| 50 |
+
"Delinearization": 0.009731292724609375,
|
| 51 |
+
"DelinearizeSPMD": 0.04425859451293945,
|
| 52 |
+
"DoNothing": 0.006867170333862305,
|
| 53 |
+
"DramToDramTranspose": 0.012907743453979492,
|
| 54 |
+
"DumpGraphAndMetadata": 0.07597684860229492,
|
| 55 |
+
"EliminateDivs": 0.0021903514862060547,
|
| 56 |
+
"ExpandBatchNorm": 0.001527547836303711,
|
| 57 |
+
"ExpandISAMacro": 0.024112701416015625,
|
| 58 |
+
"FactorizeBlkDims": 0.05227327346801758,
|
| 59 |
+
"FactorizeThreadAxesInFreeDims": 0.003031015396118164,
|
| 60 |
+
"FlattenMacroLoop": 0.004990577697753906,
|
| 61 |
+
"GenericAccessSimplifier": 0.0007598400115966797,
|
| 62 |
+
"HoistCompute": 1.2000000424450263e-05,
|
| 63 |
+
"IdentifyCrossPassTensors": 5.0000002374872565e-05,
|
| 64 |
+
"InferInitValue": 0.10130023956298828,
|
| 65 |
+
"InferIntrinsicOnCC": 0.007919549942016602,
|
| 66 |
+
"InferNeuronTensor": 0.05837249755859375,
|
| 67 |
+
"InferNonlocalTensors": 0.05706453323364258,
|
| 68 |
+
"InferPSumTensor": 0.06946349143981934,
|
| 69 |
+
"InferShardAxis": 0.4604020118713379,
|
| 70 |
+
"InferSharedMemLoc": 0.05161857604980469,
|
| 71 |
+
"InlineNativeKernels": 0.006569623947143555,
|
| 72 |
+
"InsertCoreBarrier": 0.018887758255004883,
|
| 73 |
+
"InsertIOTransposes": 0.0684211254119873,
|
| 74 |
+
"InsertImplicitShardAxisBeforeISel": 0.01549673080444336,
|
| 75 |
+
"InsertLocalTransposes": 0.022176742553710938,
|
| 76 |
+
"InsertOffloadedTransposes": 0.0181121826171875,
|
| 77 |
+
"LICM": 0.007555484771728516,
|
| 78 |
+
"LateLegalizeInst": 0.0287015438079834,
|
| 79 |
+
"LateLegalizePostSplit": 0.01993083953857422,
|
| 80 |
+
"LateLowerReshapeOp": 0.0016782283782958984,
|
| 81 |
+
"LateLowerTensorOp": 0.0021178722381591797,
|
| 82 |
+
"LateNeuronInstComb": 0.05098986625671387,
|
| 83 |
+
"LayoutPreprocessing": 0.10170960426330566,
|
| 84 |
+
"LayoutPreprocessingAndAnalysis": 0.23344039916992188,
|
| 85 |
+
"LayoutRequirementAnalysis": 0.032952308654785156,
|
| 86 |
+
"LegalizeCCOpLayout": 0.002583742141723633,
|
| 87 |
+
"LegalizeOpLevelAlias": 0.002170562744140625,
|
| 88 |
+
"LegalizePartitionReduce": 0.0025551319122314453,
|
| 89 |
+
"LegalizeSundaAccess": 0.1115577220916748,
|
| 90 |
+
"LegalizeSundaMacro": 0.04086017608642578,
|
| 91 |
+
"LegalizeType": 0.033699750900268555,
|
| 92 |
+
"LocalLayoutOpt": 0.023218154907226563,
|
| 93 |
+
"LoopFusion": 0.005990266799926758,
|
| 94 |
+
"LoopSplitting": 0.0007989406585693359,
|
| 95 |
+
"LowerBroadcast": 0.011745214462280273,
|
| 96 |
+
"LowerCCOpBlockAxis": 0.007201671600341797,
|
| 97 |
+
"LowerComplexBroadcast": 0.00890207290649414,
|
| 98 |
+
"LowerIntrinsics": 0.10557985305786133,
|
| 99 |
+
"LowerShardAxis": 0.023633956909179688,
|
| 100 |
+
"LowerTensorOp": 0.03027796745300293,
|
| 101 |
+
"LowerToSendRecv": 0.027859210968017578,
|
| 102 |
+
"LowerTranspose": 0.028818368911743164,
|
| 103 |
+
"MacroGeneration": 0.12761783599853516,
|
| 104 |
+
"MaskPropagation": 0.01400303840637207,
|
| 105 |
+
"MemcastMotion": 2.7999998565064743e-05,
|
| 106 |
+
"MemcpyElimination": 0.03596854209899902,
|
| 107 |
+
"MutateDataType": 0.0020971298217773438,
|
| 108 |
+
"NeuronAliasDependencyInduction": 0.0019202232360839844,
|
| 109 |
+
"NeuronAliasDependencyReset": 0.027405738830566406,
|
| 110 |
+
"NeuronInstComb": 0.048494815826416016,
|
| 111 |
+
"NeuronLICM": 0.052613019943237305,
|
| 112 |
+
"NeuronLoopFusion": 0.06255030632019043,
|
| 113 |
+
"NeuronLoopInterchange": 0.002681255340576172,
|
| 114 |
+
"NeuronSimplifier": 0.01907205581665039,
|
| 115 |
+
"NeuronSimplifyPredicates": 0.04273796081542969,
|
| 116 |
+
"NeuronValueNumbering": 0.019763708114624023,
|
| 117 |
+
"OptimizeAliasedCopyChain": 0.0005273818969726563,
|
| 118 |
+
"OptimizeNKIKernels": 4.391921043395996,
|
| 119 |
+
"PAGLayoutOpt": 0.16190624237060547,
|
| 120 |
+
"PComputeCutting": 0.016373872756958008,
|
| 121 |
+
"PGLayoutTilingPipeline": 2.0541465282440186,
|
| 122 |
+
"PGTiling": 0.3632845878601074,
|
| 123 |
+
"PadElimination": 0.0006501674652099609,
|
| 124 |
+
"ParAxesAnnotation": 0.08851456642150879,
|
| 125 |
+
"PartialLoopFusion": 0.05034661293029785,
|
| 126 |
+
"PartialSimdFusion": 0.014182329177856445,
|
| 127 |
+
"PenguinizeFunctions": 3.899999865097925e-05,
|
| 128 |
+
"PerfectLoopNest": 0.0036270618438720703,
|
| 129 |
+
"PruneFunctions": 3.7999998312443495e-05,
|
| 130 |
+
"RecognizeOpIdiom": 0.007064342498779297,
|
| 131 |
+
"Recompute": 0.00046062469482421875,
|
| 132 |
+
"RelaxPredicates": 0.02269601821899414,
|
| 133 |
+
"Rematerialization": 0.0019779205322265625,
|
| 134 |
+
"RemoveOptimizationBarriers": 4.400000034365803e-05,
|
| 135 |
+
"RemoveShardedPartitionAxes": 0.014830350875854492,
|
| 136 |
+
"ReshapeWeights": 0.0021474361419677734,
|
| 137 |
+
"ResolveAccessConflict": 0.007428646087646484,
|
| 138 |
+
"ResolveComplicatePredicates": 0.001834869384765625,
|
| 139 |
+
"RewriteReplicationMatmul": 0.006201982498168945,
|
| 140 |
+
"RewriteWeights": 0.004793643951416016,
|
| 141 |
+
"SFKVectorizer": 0.41699957847595215,
|
| 142 |
+
"ScatterMotion": 3.80000019504223e-05,
|
| 143 |
+
"ShardingPropagationAnalysis": 0.2801475524902344,
|
| 144 |
+
"SimpleAllReduceTiling": 0.025059938430786133,
|
| 145 |
+
"Simplifier": 0.003251314163208008,
|
| 146 |
+
"SimplifyMacroPredicates": 0.03280019760131836,
|
| 147 |
+
"SimplifyNeuronTensor": 0.14811110496520996,
|
| 148 |
+
"SimplifySlice": 0.0008628368377685547,
|
| 149 |
+
"SimplifyTensor": 0.014911413192749023,
|
| 150 |
+
"SpillPSum": 0.0687708854675293,
|
| 151 |
+
"SplitAPUnionSets": 0.09714126586914063,
|
| 152 |
+
"SplitAccGrp": 0.006166219711303711,
|
| 153 |
+
"StaticProfiler": 0.021403789520263672,
|
| 154 |
+
"StaticTransposeLocalTensor": 0.02319931983947754,
|
| 155 |
+
"SundaISel": 0.07143282890319824,
|
| 156 |
+
"TCTransform": 0.001344442367553711,
|
| 157 |
+
"TensorInitialization": 0.020877599716186523,
|
| 158 |
+
"TensorOpSimplifier": 0.0060787200927734375,
|
| 159 |
+
"TensorOpTransform": 0.03784608840942383,
|
| 160 |
+
"TensorizerLegalizationPass": 5.0000002374872565e-05,
|
| 161 |
+
"TileCCOps": 0.005100250244140625,
|
| 162 |
+
"TilingProfiler": 0.02941441535949707,
|
| 163 |
+
"TransformConvOp": 0.005896091461181641,
|
| 164 |
+
"TritiumFusion": 0.08978962898254395,
|
| 165 |
+
"ValueNumbering": 0.0032432079315185547,
|
| 166 |
+
"VectorizeDMA": 0.005987644195556641,
|
| 167 |
+
"VectorizeMatMult": 0.019278526306152344,
|
| 168 |
+
"VerifySupportedOps": 3.600000127335079e-05,
|
| 169 |
+
"WeightCoalescing": 0.014359712600708008,
|
| 170 |
+
"ZeroSizeTensorElimination": 0.00021028518676757813,
|
| 171 |
+
"algsimp": 0.001816999982111156,
|
| 172 |
+
"batchnorm_expander": 3.5000000934815034e-05,
|
| 173 |
+
"boundary-marker-removal": 1.2999998943996616e-05,
|
| 174 |
+
"call-inliner": 0.00031099998159334064,
|
| 175 |
+
"canonicalize-boundary-marker": 1.5999999959603883e-05,
|
| 176 |
+
"collective-stream-id-checker": 7.60000039008446e-05,
|
| 177 |
+
"comparison-expander": 0.0004780000017490238,
|
| 178 |
+
"computation-deduplicator": 5.699999746866524e-05,
|
| 179 |
+
"config-lowering": 0.00012000000424450263,
|
| 180 |
+
"constant-statistics": 0.00038899999344721437,
|
| 181 |
+
"constant_folding": 0.00016199999663513154,
|
| 182 |
+
"cse": 3.5000000934815034e-05,
|
| 183 |
+
"dce": 4.3000000005122274e-05,
|
| 184 |
+
"dot_decomposer": 0.0010089999996125698,
|
| 185 |
+
"dynamic-slice-transpose": 1.2000000424450263e-05,
|
| 186 |
+
"eliminate-redundant-compare": 0.00013299999409355223,
|
| 187 |
+
"emit-offloaded-dropout": 3.7000001611886546e-05,
|
| 188 |
+
"flatten-call-graph": 0.0008110000053420663,
|
| 189 |
+
"fuse-send-recv": 6.600000051548705e-05,
|
| 190 |
+
"hilo-conditional-to-select": 1.2999999853491317e-05,
|
| 191 |
+
"hilo::LegalizeAlias": 1.1000000085914508e-05,
|
| 192 |
+
"hilo::NeuronInstCombine": 0.00019799999427050352,
|
| 193 |
+
"hilo::NeuronOpFusion": 3.7000001611886546e-05,
|
| 194 |
+
"hilo::ReplaceTokenTypeWithU8Pass": 4.8000001697801054e-05,
|
| 195 |
+
"hilo::ScheduleFusion": 3.999999989900971e-06,
|
| 196 |
+
"hilo::SixtyFourHack": 6.800000119255856e-05,
|
| 197 |
+
"hilo::VerifyAliasing": 3.999999989900971e-06,
|
| 198 |
+
"hlo-mac-count": 0.012529000639915466,
|
| 199 |
+
"instruction-histogram": 0.0008679999737069011,
|
| 200 |
+
"io-con-pipe-begin": 6.000000212225132e-06,
|
| 201 |
+
"io-con-pipe-end": 9.999999974752427e-07,
|
| 202 |
+
"io-layout-normalization": 0.0010789999505504966,
|
| 203 |
+
"io-statistics": 3.899999865097925e-05,
|
| 204 |
+
"legalize-ccops-for-tensorizer": 3.000000106112566e-06,
|
| 205 |
+
"legalize-compare": 1.1000000085914508e-05,
|
| 206 |
+
"lower-argminmax-custom-call": 9.999999747378752e-06,
|
| 207 |
+
"map-inline": 0.000813000020571053,
|
| 208 |
+
"metadata-naming": 4.900000203633681e-05,
|
| 209 |
+
"mlir::detail::OpToOpPassAdaptor": 7.60000039008446e-05,
|
| 210 |
+
"mlir::hlo::MhloToPyPenguin": 0.008621999993920326,
|
| 211 |
+
"mlir::mhlo::LowerComplexExtraPass": 0.00021299999207258224,
|
| 212 |
+
"mlir::mhlo::LowerComplexPass": 0.0003549999964889139,
|
| 213 |
+
"native-to-custom-softmax": 0.00033000000985339284,
|
| 214 |
+
"native-to-custom-softmax-dx": 0.0016530000139027834,
|
| 215 |
+
"neuron-hlo-verifier": 0.011901999823749065,
|
| 216 |
+
"operand_upcaster": 5.299999611452222e-05,
|
| 217 |
+
"opt-barrier-removal": 0.0003209999995306134,
|
| 218 |
+
"post-par-pipe-begin": 0.0003220000071451068,
|
| 219 |
+
"post-par-pipe-end": 0.0,
|
| 220 |
+
"post-partition-simplification": 0.0015040000434964895,
|
| 221 |
+
"pre-par-pipe-begin": 9.999999974752427e-07,
|
| 222 |
+
"pre-par-pipe-end": 0.0,
|
| 223 |
+
"pre-partition-simplification": 0.06566499918699265,
|
| 224 |
+
"replace-minimum-constant": 0.0003129999968223274,
|
| 225 |
+
"reshape-mover": 6.000000212225132e-05,
|
| 226 |
+
"simplify-concat": 0.00011900000390596688,
|
| 227 |
+
"simplify-while-loops": 5.900000178371556e-05,
|
| 228 |
+
"transform-variadic-reduce": 6.399999983841553e-05,
|
| 229 |
+
"tuple-simplifier": 0.00015100000018719584,
|
| 230 |
+
"unpack-nested-aws-ntwsr": 0.00023299999884329736,
|
| 231 |
+
"unroll-while-loop": 9.000000318337698e-06,
|
| 232 |
+
"zero_sized_hlo_elimination": 0.0007510000141337514
|
| 233 |
+
},
|
| 234 |
+
"hilo": {
|
| 235 |
+
"ConstantSize": 1843839.0,
|
| 236 |
+
"HloInputCount": 371.0,
|
| 237 |
+
"HloMacCount": 53843722240.0,
|
| 238 |
+
"HloOutputCount": 57.0,
|
| 239 |
+
"IfmapSize": 3910920192.0,
|
| 240 |
+
"OfmapSize": 1879048192.0,
|
| 241 |
+
"OutputsReadFromCount": 0.0,
|
| 242 |
+
"PassthroughTensorsCount": 0.0,
|
| 243 |
+
"RedundantOutputCount": 0.0,
|
| 244 |
+
"Traffic": 915302528.0
|
| 245 |
+
},
|
| 246 |
+
"tensorizer": {
|
| 247 |
+
"DMATilingProfiler::TotalInstructionsAfterTiling": 22664.0,
|
| 248 |
+
"StaticProfiler::AifUb": 229.36119079589844,
|
| 249 |
+
"StaticProfiler::ArithmeticIntensityTensorizer": 194.92408752441406,
|
| 250 |
+
"StaticProfiler::AverageDmaLength": 2258.685546875,
|
| 251 |
+
"StaticProfiler::DDRTransferBytes": 420482080.0,
|
| 252 |
+
"StaticProfiler::InternalTransferBytes": 338614048.0,
|
| 253 |
+
"StaticProfiler::LoadExpanded": 118366.0,
|
| 254 |
+
"StaticProfiler::StoreExpanded": 4458.0,
|
| 255 |
+
"StaticProfiler::TotalDMAExpanded": 122824.0,
|
| 256 |
+
"StaticProfiler::TotalDynamicInstancesCount": 27423.0,
|
| 257 |
+
"StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 26972.0,
|
| 258 |
+
"StaticProfiler::TotalLNCComm": 0.0,
|
| 259 |
+
"StaticProfiler::TotalLNCCommTransfer": 0.0,
|
| 260 |
+
"TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
|
| 261 |
+
"TilingProfiler::DmaInstructionsAfterTiling": 0.0,
|
| 262 |
+
"TilingProfiler::GenericInstructionsAfterTiling": 4.0,
|
| 263 |
+
"TilingProfiler::MatMultInstructionsAfterTiling": 11808.0,
|
| 264 |
+
"TilingProfiler::NumPfTransposes": 5.0,
|
| 265 |
+
"TilingProfiler::NumPfTransposesForIo": 1.0,
|
| 266 |
+
"TilingProfiler::NumPfTransposesForLocal": 1.0,
|
| 267 |
+
"TilingProfiler::NumPfTransposesForNonlocal": 3.0,
|
| 268 |
+
"TilingProfiler::PfTransposeInstructions": 9889.0,
|
| 269 |
+
"TilingProfiler::PfTransposeInstructionsForIo": 9504.0,
|
| 270 |
+
"TilingProfiler::PfTransposeInstructionsForLocal": 1.0,
|
| 271 |
+
"TilingProfiler::PfTransposeInstructionsForNonlocal": 384.0,
|
| 272 |
+
"TilingProfiler::ReduceInstructionsAfterTiling": 6.0,
|
| 273 |
+
"TilingProfiler::SimdInstructionsAfterTiling": 165.0,
|
| 274 |
+
"TilingProfiler::TotalInstructionsAfterTiling": 0.0,
|
| 275 |
+
"TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
|
| 276 |
+
"TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
|
| 277 |
+
"TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
|
| 278 |
+
"TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
|
| 279 |
+
"TransformConvOp::conv2d_column_packing": 0.0,
|
| 280 |
+
"TransformConvOp::conv2d_column_packing_1": 0.0,
|
| 281 |
+
"TransformConvOp::conv2d_column_packing_io10": 0.0,
|
| 282 |
+
"TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
|
| 283 |
+
}
|
| 284 |
+
},
|
| 285 |
+
"all": {
|
| 286 |
+
"compiletime": {
|
| 287 |
+
"algsimp": 0.0016659999964758754,
|
| 288 |
+
"call-inliner": 0.0002859999949578196,
|
| 289 |
+
"collective-stream-id-checker": 6.600000051548705e-05,
|
| 290 |
+
"comparison-expander": 0.00045900000259280205,
|
| 291 |
+
"constant-statistics": 0.00038899999344721437,
|
| 292 |
+
"constant_folding": 0.00014000000373926014,
|
| 293 |
+
"dce": 3.9999998989515007e-05,
|
| 294 |
+
"dot_decomposer": 0.0010089999996125698,
|
| 295 |
+
"eliminate-redundant-compare": 0.00012399999832268804,
|
| 296 |
+
"flatten-call-graph": 0.0007849999819882214,
|
| 297 |
+
"hlo-mac-count": 0.007579999975860119,
|
| 298 |
+
"instruction-histogram": 0.0008679999737069011,
|
| 299 |
+
"io-con-pipe-begin": 6.000000212225132e-06,
|
| 300 |
+
"io-con-pipe-end": 9.999999974752427e-07,
|
| 301 |
+
"io-layout-normalization": 0.0010789999505504966,
|
| 302 |
+
"io-statistics": 3.899999865097925e-05,
|
| 303 |
+
"map-inline": 0.0007789999945089221,
|
| 304 |
+
"native-to-custom-softmax": 0.000311999989207834,
|
| 305 |
+
"native-to-custom-softmax-dx": 0.00039400000241585076,
|
| 306 |
+
"neuron-hlo-verifier": 0.01071999967098236,
|
| 307 |
+
"opt-barrier-removal": 0.0003209999995306134,
|
| 308 |
+
"pre-par-pipe-begin": 9.999999974752427e-07,
|
| 309 |
+
"pre-par-pipe-end": 0.0,
|
| 310 |
+
"pre-partition-simplification": 0.06566499918699265,
|
| 311 |
+
"replace-minimum-constant": 0.00029399999766610563,
|
| 312 |
+
"reshape-mover": 5.199999941396527e-05,
|
| 313 |
+
"simplify-while-loops": 5.2999999752501026e-05,
|
| 314 |
+
"tuple-simplifier": 0.00013800000306218863,
|
| 315 |
+
"unpack-nested-aws-ntwsr": 0.0002209999947808683,
|
| 316 |
+
"unroll-while-loop": 9.000000318337698e-06,
|
| 317 |
+
"zero_sized_hlo_elimination": 0.0007510000141337514
|
| 318 |
+
}
|
| 319 |
+
},
|
| 320 |
+
"attention_isa_kernel": {
|
| 321 |
+
"compiletime": {
|
| 322 |
+
"CoalesceCCOp": 0.00023293495178222656,
|
| 323 |
+
"DMALocalityOpt": 0.0001811981201171875,
|
| 324 |
+
"DMAProfiler": 0.00021409988403320313,
|
| 325 |
+
"DataStreaming": 0.00021123886108398438,
|
| 326 |
+
"DoNothing": 0.00015926361083984375,
|
| 327 |
+
"ExpandISAMacro": 0.00025653839111328125,
|
| 328 |
+
"FactorizeBlkDims": 0.0004589557647705078,
|
| 329 |
+
"InferPSumTensor": 0.001004934310913086,
|
| 330 |
+
"InferSharedMemLoc": 0.0005850791931152344,
|
| 331 |
+
"InsertCoreBarrier": 0.00032901763916015625,
|
| 332 |
+
"LateLegalizeInst": 0.000202178955078125,
|
| 333 |
+
"LateNeuronInstComb": 0.000457763671875,
|
| 334 |
+
"LegalizeSundaAccess": 0.000244140625,
|
| 335 |
+
"LegalizeType": 0.00035119056701660156,
|
| 336 |
+
"LowerBroadcast": 0.0002529621124267578,
|
| 337 |
+
"LowerIntrinsics": 0.00025534629821777344,
|
| 338 |
+
"LowerTranspose": 0.00019860267639160156,
|
| 339 |
+
"NeuronInstComb": 0.0004410743713378906,
|
| 340 |
+
"NeuronLICM": 0.00022935867309570313,
|
| 341 |
+
"NeuronSimplifyPredicates": 0.00023698806762695313,
|
| 342 |
+
"NeuronValueNumbering": 0.00019621849060058594,
|
| 343 |
+
"SFKVectorizer": 0.0017054080963134766,
|
| 344 |
+
"SimpleAllReduceTiling": 0.00020575523376464844,
|
| 345 |
+
"SimplifyNeuronTensor": 0.00058746337890625,
|
| 346 |
+
"SpillPSum": 0.0008275508880615234,
|
| 347 |
+
"WeightCoalescing": 0.0002827644348144531
|
| 348 |
+
}
|
| 349 |
+
},
|
| 350 |
+
"cumsum": {
|
| 351 |
+
"compiletime": {
|
| 352 |
+
"CoalesceCCOp": 0.0004239082336425781,
|
| 353 |
+
"DMALocalityOpt": 0.0008606910705566406,
|
| 354 |
+
"DMAProfiler": 0.0012273788452148438,
|
| 355 |
+
"DataStreaming": 0.0004677772521972656,
|
| 356 |
+
"DoNothing": 0.0020771026611328125,
|
| 357 |
+
"ExpandISAMacro": 0.0009121894836425781,
|
| 358 |
+
"FactorizeBlkDims": 0.0007412433624267578,
|
| 359 |
+
"InferPSumTensor": 0.0011811256408691406,
|
| 360 |
+
"InferSharedMemLoc": 0.00045990943908691406,
|
| 361 |
+
"InsertCoreBarrier": 0.00042891502380371094,
|
| 362 |
+
"LateLegalizeInst": 0.00063323974609375,
|
| 363 |
+
"LateNeuronInstComb": 0.0013093948364257813,
|
| 364 |
+
"LegalizeSundaAccess": 0.0025353431701660156,
|
| 365 |
+
"LegalizeType": 0.001573801040649414,
|
| 366 |
+
"LowerBroadcast": 0.0004336833953857422,
|
| 367 |
+
"LowerIntrinsics": 0.0003495216369628906,
|
| 368 |
+
"LowerTranspose": 0.00044226646423339844,
|
| 369 |
+
"NeuronInstComb": 0.007911205291748047,
|
| 370 |
+
"NeuronLICM": 0.0006246566772460938,
|
| 371 |
+
"NeuronSimplifyPredicates": 0.006840705871582031,
|
| 372 |
+
"NeuronValueNumbering": 0.0007255077362060547,
|
| 373 |
+
"SFKVectorizer": 0.008939266204833984,
|
| 374 |
+
"SimpleAllReduceTiling": 0.0003476142883300781,
|
| 375 |
+
"SimplifyNeuronTensor": 0.0009677410125732422,
|
| 376 |
+
"SpillPSum": 0.0031452178955078125,
|
| 377 |
+
"WeightCoalescing": 0.000408172607421875
|
| 378 |
+
}
|
| 379 |
+
},
|
| 380 |
+
"sg00": {
|
| 381 |
+
"compiletime": {
|
| 382 |
+
"CanonicalizeConv": 0.0,
|
| 383 |
+
"CanonicalizeForTensorizer": 1.4000000192027073e-05,
|
| 384 |
+
"Canonicalizer": 0.0002680000034160912,
|
| 385 |
+
"HoistCompute": 3.000000106112566e-06,
|
| 386 |
+
"IdentifyCrossPassTensors": 1.4000000192027073e-05,
|
| 387 |
+
"MemcastMotion": 9.000000318337698e-06,
|
| 388 |
+
"PenguinizeFunctions": 1.4999999621068127e-05,
|
| 389 |
+
"PruneFunctions": 1.2999999853491317e-05,
|
| 390 |
+
"RemoveOptimizationBarriers": 7.000000096013537e-06,
|
| 391 |
+
"ScatterMotion": 1.8000000636675395e-05,
|
| 392 |
+
"TensorizerLegalizationPass": 2.700000004551839e-05,
|
| 393 |
+
"VerifySupportedOps": 1.4000000192027073e-05,
|
| 394 |
+
"algsimp": 4.8000001697801054e-05,
|
| 395 |
+
"batchnorm_expander": 1.2999999853491317e-05,
|
| 396 |
+
"boundary-marker-removal": 3.999999989900971e-06,
|
| 397 |
+
"call-inliner": 7.000000096013537e-06,
|
| 398 |
+
"canonicalize-boundary-marker": 4.999999873689376e-06,
|
| 399 |
+
"collective-stream-id-checker": 3.999999989900971e-06,
|
| 400 |
+
"comparison-expander": 7.000000096013537e-06,
|
| 401 |
+
"computation-deduplicator": 1.700000029813964e-05,
|
| 402 |
+
"config-lowering": 3.899999865097925e-05,
|
| 403 |
+
"constant_folding": 7.000000096013537e-06,
|
| 404 |
+
"cse": 1.1000000085914508e-05,
|
| 405 |
+
"dce": 9.999999974752427e-07,
|
| 406 |
+
"dynamic-slice-transpose": 3.999999989900971e-06,
|
| 407 |
+
"eliminate-redundant-compare": 3.000000106112566e-06,
|
| 408 |
+
"emit-offloaded-dropout": 1.2000000424450263e-05,
|
| 409 |
+
"flatten-call-graph": 7.999999979801942e-06,
|
| 410 |
+
"fuse-send-recv": 2.8000000384054147e-05,
|
| 411 |
+
"hilo-conditional-to-select": 3.999999989900971e-06,
|
| 412 |
+
"hilo::LegalizeAlias": 4.999999873689376e-06,
|
| 413 |
+
"hilo::NeuronInstCombine": 7.79999973019585e-05,
|
| 414 |
+
"hilo::NeuronOpFusion": 1.4000000192027073e-05,
|
| 415 |
+
"hilo::ReplaceTokenTypeWithU8Pass": 1.8999999156221747e-05,
|
| 416 |
+
"hilo::ScheduleFusion": 9.999999974752427e-07,
|
| 417 |
+
"hilo::SixtyFourHack": 1.1000000085914508e-05,
|
| 418 |
+
"hilo::VerifyAliasing": 1.9999999949504854e-06,
|
| 419 |
+
"hlo-mac-count": 9.999999747378752e-05,
|
| 420 |
+
"legalize-ccops-for-tensorizer": 9.999999974752427e-07,
|
| 421 |
+
"legalize-compare": 3.999999989900971e-06,
|
| 422 |
+
"lower-argminmax-custom-call": 3.000000106112566e-06,
|
| 423 |
+
"map-inline": 1.2000000424450263e-05,
|
| 424 |
+
"metadata-naming": 1.4000000192027073e-05,
|
| 425 |
+
"mlir::detail::OpToOpPassAdaptor": 2.099999983329326e-05,
|
| 426 |
+
"mlir::hlo::MhloToPyPenguin": 0.001617999980226159,
|
| 427 |
+
"mlir::mhlo::LowerComplexExtraPass": 6.70000008540228e-05,
|
| 428 |
+
"mlir::mhlo::LowerComplexPass": 0.00011800000356743112,
|
| 429 |
+
"native-to-custom-softmax": 7.000000096013537e-06,
|
| 430 |
+
"native-to-custom-softmax-dx": 0.001218999968841672,
|
| 431 |
+
"neuron-hlo-verifier": 0.0004619999963324517,
|
| 432 |
+
"operand_upcaster": 2.099999983329326e-05,
|
| 433 |
+
"post-par-pipe-begin": 0.00031800000579096377,
|
| 434 |
+
"post-par-pipe-end": 0.0,
|
| 435 |
+
"post-partition-simplification": 0.00047400000039488077,
|
| 436 |
+
"replace-minimum-constant": 6.000000212225132e-06,
|
| 437 |
+
"reshape-mover": 3.000000106112566e-06,
|
| 438 |
+
"simplify-concat": 3.7000001611886546e-05,
|
| 439 |
+
"simplify-while-loops": 1.9999999949504854e-06,
|
| 440 |
+
"transform-variadic-reduce": 7.999999979801942e-06,
|
| 441 |
+
"tuple-simplifier": 3.999999989900971e-06,
|
| 442 |
+
"unpack-nested-aws-ntwsr": 3.999999989900971e-06,
|
| 443 |
+
"unroll-while-loop": 0.0
|
| 444 |
+
},
|
| 445 |
+
"hilo": {
|
| 446 |
+
"ArithmeticIntensity": 36.6374397277832,
|
| 447 |
+
"ConstantSize": 1843839.0,
|
| 448 |
+
"HloInputCount": 371.0,
|
| 449 |
+
"HloMacCount": 7516192768.0,
|
| 450 |
+
"HloOutputCount": 57.0,
|
| 451 |
+
"IfmapSize": 3910920192.0,
|
| 452 |
+
"OfmapSize": 1879048192.0,
|
| 453 |
+
"OutputsReadFromCount": 0.0,
|
| 454 |
+
"PassthroughTensorsCount": 0.0,
|
| 455 |
+
"RedundantOutputCount": 0.0,
|
| 456 |
+
"Traffic": 410301216.0
|
| 457 |
+
}
|
| 458 |
+
},
|
| 459 |
+
"sg0000": {
|
| 460 |
+
"compiletime": {
|
| 461 |
+
"AGOrderingAnalysisPass": 0.10170578956604004,
|
| 462 |
+
"AffinePredicateResolution": 0.002114534378051758,
|
| 463 |
+
"AliasDependencyElimination": 0.0003135204315185547,
|
| 464 |
+
"AliasDependencyInduction": 0.008873462677001953,
|
| 465 |
+
"AliasDependencyReset": 0.08848953247070313,
|
| 466 |
+
"BFComputeCutting": 0.0046901702880859375,
|
| 467 |
+
"BirCodeGenLoop": 0.07164216041564941,
|
| 468 |
+
"CCOpFusion": 0.03796195983886719,
|
| 469 |
+
"CanonicalizeDAGForPGTiling": 0.004980564117431641,
|
| 470 |
+
"CanonicalizeIR": 0.0069043636322021484,
|
| 471 |
+
"CoalesceCCOp": 0.025182723999023438,
|
| 472 |
+
"CommuteConcat": 0.0019867420196533203,
|
| 473 |
+
"DMALocalityOpt": 0.0017561912536621094,
|
| 474 |
+
"DMAProfiler": 0.015140295028686523,
|
| 475 |
+
"DMATilingProfiler": 0.016626596450805664,
|
| 476 |
+
"DataLocalityOpt": 0.22760343551635742,
|
| 477 |
+
"DataStreaming": 0.010300159454345703,
|
| 478 |
+
"DeConcat": 0.0027208328247070313,
|
| 479 |
+
"DeadCodeElimination": 0.0024912357330322266,
|
| 480 |
+
"DeadStoreElimination": 0.0712437629699707,
|
| 481 |
+
"DelinearIndices": 0.016620635986328125,
|
| 482 |
+
"Delinearization": 0.009757280349731445,
|
| 483 |
+
"DelinearizeSPMD": 0.031106233596801758,
|
| 484 |
+
"DoNothing": 0.00010442733764648438,
|
| 485 |
+
"DramToDramTranspose": 0.015790462493896484,
|
| 486 |
+
"DumpGraphAndMetadata": 0.009348392486572266,
|
| 487 |
+
"EliminateDivs": 0.0055081844329833984,
|
| 488 |
+
"ExpandBatchNorm": 0.002715587615966797,
|
| 489 |
+
"ExpandISAMacro": 0.006904125213623047,
|
| 490 |
+
"FactorizeBlkDims": 0.02294635772705078,
|
| 491 |
+
"FactorizeThreadAxesInFreeDims": 0.004876136779785156,
|
| 492 |
+
"FlattenMacroLoop": 0.014545440673828125,
|
| 493 |
+
"GenericAccessSimplifier": 0.0014882087707519531,
|
| 494 |
+
"InferInitValue": 0.07265543937683105,
|
| 495 |
+
"InferIntrinsicOnCC": 0.016221046447753906,
|
| 496 |
+
"InferNeuronTensor": 0.06634330749511719,
|
| 497 |
+
"InferNonlocalTensors": 0.310718297958374,
|
| 498 |
+
"InferPSumTensor": 0.1104276180267334,
|
| 499 |
+
"InferShardAxis": 0.6379494667053223,
|
| 500 |
+
"InferSharedMemLoc": 0.007468461990356445,
|
| 501 |
+
"InlineNativeKernels": 0.008686304092407227,
|
| 502 |
+
"InsertCoreBarrier": 0.013060331344604492,
|
| 503 |
+
"InsertIOTransposes": 0.0500941276550293,
|
| 504 |
+
"InsertImplicitShardAxisBeforeISel": 0.013952255249023438,
|
| 505 |
+
"InsertLocalTransposes": 0.011726140975952148,
|
| 506 |
+
"InsertOffloadedTransposes": 0.015027046203613281,
|
| 507 |
+
"LICM": 0.009333610534667969,
|
| 508 |
+
"LateLegalizeInst": 0.02084517478942871,
|
| 509 |
+
"LateLegalizePostSplit": 0.006055116653442383,
|
| 510 |
+
"LateLowerReshapeOp": 0.0010623931884765625,
|
| 511 |
+
"LateLowerTensorOp": 0.005917787551879883,
|
| 512 |
+
"LateNeuronInstComb": 0.0374608039855957,
|
| 513 |
+
"LayoutPreprocessing": 0.11253118515014648,
|
| 514 |
+
"LayoutPreprocessingAndAnalysis": 0.17174959182739258,
|
| 515 |
+
"LayoutRequirementAnalysis": 0.01859116554260254,
|
| 516 |
+
"LegalizeCCOpLayout": 0.008987903594970703,
|
| 517 |
+
"LegalizeOpLevelAlias": 0.0018634796142578125,
|
| 518 |
+
"LegalizePartitionReduce": 0.0028128623962402344,
|
| 519 |
+
"LegalizeSundaAccess": 0.0760490894317627,
|
| 520 |
+
"LegalizeSundaMacro": 0.04249215126037598,
|
| 521 |
+
"LegalizeType": 0.017363786697387695,
|
| 522 |
+
"LocalLayoutOpt": 0.030303478240966797,
|
| 523 |
+
"LoopFusion": 0.015121221542358398,
|
| 524 |
+
"LoopSplitting": 0.001684427261352539,
|
| 525 |
+
"LowerBroadcast": 0.004286289215087891,
|
| 526 |
+
"LowerCCOpBlockAxis": 0.011670112609863281,
|
| 527 |
+
"LowerComplexBroadcast": 0.009485006332397461,
|
| 528 |
+
"LowerIntrinsics": 0.06814241409301758,
|
| 529 |
+
"LowerShardAxis": 0.01289224624633789,
|
| 530 |
+
"LowerTensorOp": 0.012324810028076172,
|
| 531 |
+
"LowerToSendRecv": 0.01944112777709961,
|
| 532 |
+
"LowerTranspose": 0.024444580078125,
|
| 533 |
+
"MacroGeneration": 0.12030863761901855,
|
| 534 |
+
"MaskPropagation": 0.0041234493255615234,
|
| 535 |
+
"MemcpyElimination": 0.11655545234680176,
|
| 536 |
+
"MutateDataType": 0.006365299224853516,
|
| 537 |
+
"NeuronAliasDependencyInduction": 0.0008358955383300781,
|
| 538 |
+
"NeuronAliasDependencyReset": 0.0208890438079834,
|
| 539 |
+
"NeuronInstComb": 0.012987852096557617,
|
| 540 |
+
"NeuronLICM": 0.03186321258544922,
|
| 541 |
+
"NeuronLoopFusion": 0.039856910705566406,
|
| 542 |
+
"NeuronLoopInterchange": 0.0034656524658203125,
|
| 543 |
+
"NeuronSimplifier": 0.04315042495727539,
|
| 544 |
+
"NeuronSimplifyPredicates": 0.005248546600341797,
|
| 545 |
+
"NeuronValueNumbering": 0.017512798309326172,
|
| 546 |
+
"OptimizeAliasedCopyChain": 0.0023038387298583984,
|
| 547 |
+
"OptimizeNKIKernels": 0.3315870761871338,
|
| 548 |
+
"PAGLayoutOpt": 0.6959309577941895,
|
| 549 |
+
"PComputeCutting": 0.02900981903076172,
|
| 550 |
+
"PGLayoutTilingPipeline": 2.8589253425598145,
|
| 551 |
+
"PGTiling": 0.4929697513580322,
|
| 552 |
+
"PadElimination": 0.0008306503295898438,
|
| 553 |
+
"ParAxesAnnotation": 0.6449503898620605,
|
| 554 |
+
"PartialLoopFusion": 0.04073286056518555,
|
| 555 |
+
"PartialSimdFusion": 0.04506206512451172,
|
| 556 |
+
"PerfectLoopNest": 0.003442049026489258,
|
| 557 |
+
"RecognizeOpIdiom": 0.01386570930480957,
|
| 558 |
+
"Recompute": 0.0005090236663818359,
|
| 559 |
+
"RelaxPredicates": 0.007751941680908203,
|
| 560 |
+
"Rematerialization": 0.0035130977630615234,
|
| 561 |
+
"RemoveShardedPartitionAxes": 0.042932987213134766,
|
| 562 |
+
"ReshapeWeights": 0.005467653274536133,
|
| 563 |
+
"ResolveAccessConflict": 0.007354259490966797,
|
| 564 |
+
"ResolveComplicatePredicates": 0.0022590160369873047,
|
| 565 |
+
"RewriteReplicationMatmul": 0.0024857521057128906,
|
| 566 |
+
"RewriteWeights": 0.007905960083007813,
|
| 567 |
+
"SFKVectorizer": 0.45865941047668457,
|
| 568 |
+
"ShardingPropagationAnalysis": 0.015976905822753906,
|
| 569 |
+
"SimpleAllReduceTiling": 0.004487752914428711,
|
| 570 |
+
"Simplifier": 0.01264333724975586,
|
| 571 |
+
"SimplifyMacroPredicates": 0.010998964309692383,
|
| 572 |
+
"SimplifyNeuronTensor": 0.020704269409179688,
|
| 573 |
+
"SimplifySlice": 0.0029506683349609375,
|
| 574 |
+
"SimplifyTensor": 0.024234533309936523,
|
| 575 |
+
"SpillPSum": 0.03745222091674805,
|
| 576 |
+
"SplitAPUnionSets": 0.0402374267578125,
|
| 577 |
+
"SplitAccGrp": 0.0030994415283203125,
|
| 578 |
+
"StaticProfiler": 0.007781982421875,
|
| 579 |
+
"StaticTransposeLocalTensor": 0.015400409698486328,
|
| 580 |
+
"SundaISel": 0.15909790992736816,
|
| 581 |
+
"TCTransform": 0.0024313926696777344,
|
| 582 |
+
"TensorInitialization": 0.00689244270324707,
|
| 583 |
+
"TensorOpSimplifier": 0.009465932846069336,
|
| 584 |
+
"TensorOpTransform": 0.05043935775756836,
|
| 585 |
+
"TileCCOps": 0.01146245002746582,
|
| 586 |
+
"TilingProfiler": 0.030185699462890625,
|
| 587 |
+
"TransformConvOp": 0.003003835678100586,
|
| 588 |
+
"TritiumFusion": 0.07740235328674316,
|
| 589 |
+
"ValueNumbering": 0.006630659103393555,
|
| 590 |
+
"VectorizeDMA": 0.006995201110839844,
|
| 591 |
+
"VectorizeMatMult": 0.019536495208740234,
|
| 592 |
+
"WeightCoalescing": 0.007775783538818359,
|
| 593 |
+
"ZeroSizeTensorElimination": 0.0001773834228515625
|
| 594 |
+
},
|
| 595 |
+
"tensorizer": {
|
| 596 |
+
"DMATilingProfiler::TotalInstructionsAfterTiling": 9885.0,
|
| 597 |
+
"StaticProfiler::AifUb": 33.7130126953125,
|
| 598 |
+
"StaticProfiler::ArithmeticIntensityTensorizer": 285.20709228515625,
|
| 599 |
+
"StaticProfiler::AverageDmaLength": 1479.2880859375,
|
| 600 |
+
"StaticProfiler::AverageFractalPeUtilization": 99.77941131591797,
|
| 601 |
+
"StaticProfiler::AveragePartitionUtilization": 99.22618865966797,
|
| 602 |
+
"StaticProfiler::AveragePeUtilization": 99.2345962524414,
|
| 603 |
+
"StaticProfiler::DDRTransferBytes": 55208456.0,
|
| 604 |
+
"StaticProfiler::InternalTransferBytes": 47980544.0,
|
| 605 |
+
"StaticProfiler::LoadExpanded": 15885.0,
|
| 606 |
+
"StaticProfiler::LocalizationEfficiency": 845.9852294921875,
|
| 607 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1554.64208984375,
|
| 608 |
+
"StaticProfiler::StoreExpanded": 10241.0,
|
| 609 |
+
"StaticProfiler::TotalDMAExpanded": 26126.0,
|
| 610 |
+
"StaticProfiler::TotalDynamicInstancesCount": 2424.0,
|
| 611 |
+
"StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 2417.0,
|
| 612 |
+
"StaticProfiler::TotalLNCComm": 0.0,
|
| 613 |
+
"StaticProfiler::TotalLNCCommTransfer": 0.0,
|
| 614 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
|
| 615 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
|
| 616 |
+
"TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
|
| 617 |
+
"TilingProfiler::DmaInstructionsAfterTiling": 0.0,
|
| 618 |
+
"TilingProfiler::GenericInstructionsAfterTiling": 80.0,
|
| 619 |
+
"TilingProfiler::MatMultInstructionsAfterTiling": 776.0,
|
| 620 |
+
"TilingProfiler::NumPfTransposes": 7.0,
|
| 621 |
+
"TilingProfiler::NumPfTransposesForIo": 0.0,
|
| 622 |
+
"TilingProfiler::NumPfTransposesForLocal": 3.0,
|
| 623 |
+
"TilingProfiler::NumPfTransposesForNonlocal": 4.0,
|
| 624 |
+
"TilingProfiler::PfTransposeInstructions": 448.0,
|
| 625 |
+
"TilingProfiler::PfTransposeInstructionsForIo": 0.0,
|
| 626 |
+
"TilingProfiler::PfTransposeInstructionsForLocal": 128.0,
|
| 627 |
+
"TilingProfiler::PfTransposeInstructionsForNonlocal": 320.0,
|
| 628 |
+
"TilingProfiler::ReduceInstructionsAfterTiling": 0.0,
|
| 629 |
+
"TilingProfiler::SimdInstructionsAfterTiling": 236.0,
|
| 630 |
+
"TilingProfiler::TotalInstructionsAfterTiling": 0.0,
|
| 631 |
+
"TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
|
| 632 |
+
"TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
|
| 633 |
+
"TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
|
| 634 |
+
"TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
|
| 635 |
+
"TransformConvOp::conv2d_column_packing": 0.0,
|
| 636 |
+
"TransformConvOp::conv2d_column_packing_1": 0.0,
|
| 637 |
+
"TransformConvOp::conv2d_column_packing_io10": 0.0,
|
| 638 |
+
"TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
|
| 639 |
+
}
|
| 640 |
+
},
|
| 641 |
+
"sg0001": {
|
| 642 |
+
"compiletime": {
|
| 643 |
+
"AGOrderingAnalysisPass": 0.11948776245117188,
|
| 644 |
+
"AffinePredicateResolution": 0.0018799304962158203,
|
| 645 |
+
"AliasDependencyElimination": 0.00021576881408691406,
|
| 646 |
+
"AliasDependencyInduction": 0.007300615310668945,
|
| 647 |
+
"AliasDependencyReset": 0.025965213775634766,
|
| 648 |
+
"BFComputeCutting": 0.0029859542846679688,
|
| 649 |
+
"BirCodeGenLoop": 0.0455019474029541,
|
| 650 |
+
"CCOpFusion": 0.04734611511230469,
|
| 651 |
+
"CanonicalizeDAGForPGTiling": 0.022237777709960938,
|
| 652 |
+
"CanonicalizeIR": 0.002727985382080078,
|
| 653 |
+
"CoalesceCCOp": 0.02167034149169922,
|
| 654 |
+
"CommuteConcat": 0.003200054168701172,
|
| 655 |
+
"DMALocalityOpt": 0.00392460823059082,
|
| 656 |
+
"DMAProfiler": 0.009830236434936523,
|
| 657 |
+
"DMATilingProfiler": 0.025944948196411133,
|
| 658 |
+
"DataLocalityOpt": 0.3604612350463867,
|
| 659 |
+
"DataStreaming": 0.009065628051757813,
|
| 660 |
+
"DeConcat": 0.0069577693939208984,
|
| 661 |
+
"DeadCodeElimination": 0.011698722839355469,
|
| 662 |
+
"DeadStoreElimination": 0.06011176109313965,
|
| 663 |
+
"DelinearIndices": 0.020532608032226563,
|
| 664 |
+
"Delinearization": 0.00762939453125,
|
| 665 |
+
"DelinearizeSPMD": 0.03405618667602539,
|
| 666 |
+
"DoNothing": 8.106231689453125e-05,
|
| 667 |
+
"DramToDramTranspose": 0.01855611801147461,
|
| 668 |
+
"DumpGraphAndMetadata": 0.008964061737060547,
|
| 669 |
+
"EliminateDivs": 0.0031299591064453125,
|
| 670 |
+
"ExpandBatchNorm": 0.0030705928802490234,
|
| 671 |
+
"ExpandISAMacro": 0.006265163421630859,
|
| 672 |
+
"FactorizeBlkDims": 0.03638315200805664,
|
| 673 |
+
"FactorizeThreadAxesInFreeDims": 0.008359670639038086,
|
| 674 |
+
"FlattenMacroLoop": 0.012061595916748047,
|
| 675 |
+
"GenericAccessSimplifier": 0.0030562877655029297,
|
| 676 |
+
"InferInitValue": 0.08994674682617188,
|
| 677 |
+
"InferIntrinsicOnCC": 0.024573802947998047,
|
| 678 |
+
"InferNeuronTensor": 0.1031036376953125,
|
| 679 |
+
"InferNonlocalTensors": 0.05871725082397461,
|
| 680 |
+
"InferPSumTensor": 0.06618380546569824,
|
| 681 |
+
"InferShardAxis": 0.7525274753570557,
|
| 682 |
+
"InferSharedMemLoc": 0.0068051815032958984,
|
| 683 |
+
"InlineNativeKernels": 0.005843400955200195,
|
| 684 |
+
"InsertCoreBarrier": 0.008070230484008789,
|
| 685 |
+
"InsertIOTransposes": 0.04006528854370117,
|
| 686 |
+
"InsertImplicitShardAxisBeforeISel": 0.01073002815246582,
|
| 687 |
+
"InsertLocalTransposes": 0.014261007308959961,
|
| 688 |
+
"InsertOffloadedTransposes": 0.03949117660522461,
|
| 689 |
+
"LICM": 0.009208917617797852,
|
| 690 |
+
"LateLegalizeInst": 0.029766082763671875,
|
| 691 |
+
"LateLegalizePostSplit": 0.005662679672241211,
|
| 692 |
+
"LateLowerReshapeOp": 0.0074732303619384766,
|
| 693 |
+
"LateLowerTensorOp": 0.003675222396850586,
|
| 694 |
+
"LateNeuronInstComb": 0.010900020599365234,
|
| 695 |
+
"LayoutPreprocessing": 0.12459802627563477,
|
| 696 |
+
"LayoutPreprocessingAndAnalysis": 0.2370927333831787,
|
| 697 |
+
"LayoutRequirementAnalysis": 0.02673649787902832,
|
| 698 |
+
"LegalizeCCOpLayout": 0.001771688461303711,
|
| 699 |
+
"LegalizeOpLevelAlias": 0.001964569091796875,
|
| 700 |
+
"LegalizePartitionReduce": 0.0026857852935791016,
|
| 701 |
+
"LegalizeSundaAccess": 0.024449825286865234,
|
| 702 |
+
"LegalizeSundaMacro": 0.031160593032836914,
|
| 703 |
+
"LegalizeType": 0.01265263557434082,
|
| 704 |
+
"LocalLayoutOpt": 0.13158392906188965,
|
| 705 |
+
"LoopFusion": 0.008500337600708008,
|
| 706 |
+
"LoopSplitting": 0.007683753967285156,
|
| 707 |
+
"LowerBroadcast": 0.0029337406158447266,
|
| 708 |
+
"LowerCCOpBlockAxis": 0.019019126892089844,
|
| 709 |
+
"LowerComplexBroadcast": 0.0050733089447021484,
|
| 710 |
+
"LowerIntrinsics": 0.045258283615112305,
|
| 711 |
+
"LowerShardAxis": 0.010171175003051758,
|
| 712 |
+
"LowerTensorOp": 0.04014849662780762,
|
| 713 |
+
"LowerToSendRecv": 0.006317615509033203,
|
| 714 |
+
"LowerTranspose": 0.02257823944091797,
|
| 715 |
+
"MacroGeneration": 0.1289076805114746,
|
| 716 |
+
"MaskPropagation": 0.007184505462646484,
|
| 717 |
+
"MemcpyElimination": 0.13024330139160156,
|
| 718 |
+
"MutateDataType": 0.0023887157440185547,
|
| 719 |
+
"NeuronAliasDependencyInduction": 0.0008273124694824219,
|
| 720 |
+
"NeuronAliasDependencyReset": 0.023006439208984375,
|
| 721 |
+
"NeuronInstComb": 0.02357006072998047,
|
| 722 |
+
"NeuronLICM": 0.016632556915283203,
|
| 723 |
+
"NeuronLoopFusion": 0.05176591873168945,
|
| 724 |
+
"NeuronLoopInterchange": 0.003633737564086914,
|
| 725 |
+
"NeuronSimplifier": 0.055544376373291016,
|
| 726 |
+
"NeuronSimplifyPredicates": 0.0042285919189453125,
|
| 727 |
+
"NeuronValueNumbering": 0.007681369781494141,
|
| 728 |
+
"OptimizeAliasedCopyChain": 0.0018992424011230469,
|
| 729 |
+
"OptimizeNKIKernels": 0.42712831497192383,
|
| 730 |
+
"PAGLayoutOpt": 0.40447092056274414,
|
| 731 |
+
"PComputeCutting": 0.02052617073059082,
|
| 732 |
+
"PGLayoutTilingPipeline": 2.5240347385406494,
|
| 733 |
+
"PGTiling": 0.4373018741607666,
|
| 734 |
+
"PadElimination": 0.0004992485046386719,
|
| 735 |
+
"ParAxesAnnotation": 0.3364219665527344,
|
| 736 |
+
"PartialLoopFusion": 0.04578566551208496,
|
| 737 |
+
"PartialSimdFusion": 0.07974457740783691,
|
| 738 |
+
"PerfectLoopNest": 0.006705045700073242,
|
| 739 |
+
"RecognizeOpIdiom": 0.007408857345581055,
|
| 740 |
+
"Recompute": 0.0003921985626220703,
|
| 741 |
+
"RelaxPredicates": 0.004956483840942383,
|
| 742 |
+
"Rematerialization": 0.00407719612121582,
|
| 743 |
+
"RemoveShardedPartitionAxes": 0.03296494483947754,
|
| 744 |
+
"ReshapeWeights": 0.0016734600067138672,
|
| 745 |
+
"ResolveAccessConflict": 0.005868196487426758,
|
| 746 |
+
"ResolveComplicatePredicates": 0.0019488334655761719,
|
| 747 |
+
"RewriteReplicationMatmul": 0.002888917922973633,
|
| 748 |
+
"RewriteWeights": 0.0121307373046875,
|
| 749 |
+
"SFKVectorizer": 0.3227095603942871,
|
| 750 |
+
"ShardingPropagationAnalysis": 0.030770540237426758,
|
| 751 |
+
"SimpleAllReduceTiling": 0.005700588226318359,
|
| 752 |
+
"Simplifier": 0.006751298904418945,
|
| 753 |
+
"SimplifyMacroPredicates": 0.0224151611328125,
|
| 754 |
+
"SimplifyNeuronTensor": 0.026612043380737305,
|
| 755 |
+
"SimplifySlice": 0.0016014575958251953,
|
| 756 |
+
"SimplifyTensor": 0.014640331268310547,
|
| 757 |
+
"SpillPSum": 0.03543543815612793,
|
| 758 |
+
"SplitAPUnionSets": 0.04225468635559082,
|
| 759 |
+
"SplitAccGrp": 0.0025916099548339844,
|
| 760 |
+
"StaticProfiler": 0.004286527633666992,
|
| 761 |
+
"StaticTransposeLocalTensor": 0.01450037956237793,
|
| 762 |
+
"SundaISel": 0.09066033363342285,
|
| 763 |
+
"TCTransform": 0.001735687255859375,
|
| 764 |
+
"TensorInitialization": 0.005040168762207031,
|
| 765 |
+
"TensorOpSimplifier": 0.009763479232788086,
|
| 766 |
+
"TensorOpTransform": 0.037050485610961914,
|
| 767 |
+
"TileCCOps": 0.007235288619995117,
|
| 768 |
+
"TilingProfiler": 0.022336721420288086,
|
| 769 |
+
"TransformConvOp": 0.003210783004760742,
|
| 770 |
+
"TritiumFusion": 0.1834256649017334,
|
| 771 |
+
"ValueNumbering": 0.007995128631591797,
|
| 772 |
+
"VectorizeDMA": 0.009528160095214844,
|
| 773 |
+
"VectorizeMatMult": 0.04178977012634277,
|
| 774 |
+
"WeightCoalescing": 0.0037496089935302734,
|
| 775 |
+
"ZeroSizeTensorElimination": 0.00022602081298828125
|
| 776 |
+
},
|
| 777 |
+
"tensorizer": {
|
| 778 |
+
"DMATilingProfiler::TotalInstructionsAfterTiling": 12395.0,
|
| 779 |
+
"StaticProfiler::AifUb": 272.9356689453125,
|
| 780 |
+
"StaticProfiler::ArithmeticIntensityTensorizer": 394.9350280761719,
|
| 781 |
+
"StaticProfiler::AverageDmaLength": 1993.7806396484375,
|
| 782 |
+
"StaticProfiler::AverageFractalPeUtilization": 100.0,
|
| 783 |
+
"StaticProfiler::AveragePartitionUtilization": 99.59767150878906,
|
| 784 |
+
"StaticProfiler::AveragePeUtilization": 100.0,
|
| 785 |
+
"StaticProfiler::DDRTransferBytes": 139593728.0,
|
| 786 |
+
"StaticProfiler::InternalTransferBytes": 38535168.0,
|
| 787 |
+
"StaticProfiler::LoadExpanded": 49793.0,
|
| 788 |
+
"StaticProfiler::LocalizationEfficiency": 144.69894409179688,
|
| 789 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 179.81776428222656,
|
| 790 |
+
"StaticProfiler::StoreExpanded": 11265.0,
|
| 791 |
+
"StaticProfiler::TotalDMAExpanded": 61058.0,
|
| 792 |
+
"StaticProfiler::TotalDynamicInstancesCount": 4975.0,
|
| 793 |
+
"StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 4975.0,
|
| 794 |
+
"StaticProfiler::TotalLNCComm": 0.0,
|
| 795 |
+
"StaticProfiler::TotalLNCCommTransfer": 0.0,
|
| 796 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
|
| 797 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
|
| 798 |
+
"TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
|
| 799 |
+
"TilingProfiler::DmaInstructionsAfterTiling": 0.0,
|
| 800 |
+
"TilingProfiler::GenericInstructionsAfterTiling": 64.0,
|
| 801 |
+
"TilingProfiler::MatMultInstructionsAfterTiling": 3072.0,
|
| 802 |
+
"TilingProfiler::NumPfTransposes": 8.0,
|
| 803 |
+
"TilingProfiler::NumPfTransposesForIo": 3.0,
|
| 804 |
+
"TilingProfiler::NumPfTransposesForLocal": 3.0,
|
| 805 |
+
"TilingProfiler::NumPfTransposesForNonlocal": 2.0,
|
| 806 |
+
"TilingProfiler::PfTransposeInstructions": 496.0,
|
| 807 |
+
"TilingProfiler::PfTransposeInstructionsForIo": 144.0,
|
| 808 |
+
"TilingProfiler::PfTransposeInstructionsForLocal": 96.0,
|
| 809 |
+
"TilingProfiler::PfTransposeInstructionsForNonlocal": 256.0,
|
| 810 |
+
"TilingProfiler::ReduceInstructionsAfterTiling": 0.0,
|
| 811 |
+
"TilingProfiler::SimdInstructionsAfterTiling": 275.0,
|
| 812 |
+
"TilingProfiler::TotalInstructionsAfterTiling": 0.0,
|
| 813 |
+
"TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
|
| 814 |
+
"TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
|
| 815 |
+
"TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
|
| 816 |
+
"TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
|
| 817 |
+
"TransformConvOp::conv2d_column_packing": 0.0,
|
| 818 |
+
"TransformConvOp::conv2d_column_packing_1": 0.0,
|
| 819 |
+
"TransformConvOp::conv2d_column_packing_io10": 0.0,
|
| 820 |
+
"TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
|
| 821 |
+
}
|
| 822 |
+
},
|
| 823 |
+
"sg0002": {
|
| 824 |
+
"compiletime": {
|
| 825 |
+
"AGOrderingAnalysisPass": 0.057534217834472656,
|
| 826 |
+
"AffinePredicateResolution": 0.0009605884552001953,
|
| 827 |
+
"AliasDependencyElimination": 0.00025153160095214844,
|
| 828 |
+
"AliasDependencyInduction": 0.006276607513427734,
|
| 829 |
+
"AliasDependencyReset": 0.027743816375732422,
|
| 830 |
+
"BFComputeCutting": 0.0031321048736572266,
|
| 831 |
+
"BirCodeGenLoop": 0.5169932842254639,
|
| 832 |
+
"CCOpFusion": 0.05496716499328613,
|
| 833 |
+
"CanonicalizeDAGForPGTiling": 0.010706663131713867,
|
| 834 |
+
"CanonicalizeIR": 0.00154876708984375,
|
| 835 |
+
"CoalesceCCOp": 0.020469188690185547,
|
| 836 |
+
"CommuteConcat": 0.001708984375,
|
| 837 |
+
"DMALocalityOpt": 0.0024063587188720703,
|
| 838 |
+
"DMAProfiler": 0.021881103515625,
|
| 839 |
+
"DMATilingProfiler": 0.011522531509399414,
|
| 840 |
+
"DataLocalityOpt": 0.28015780448913574,
|
| 841 |
+
"DataStreaming": 0.018134355545043945,
|
| 842 |
+
"DeConcat": 0.002462148666381836,
|
| 843 |
+
"DeadCodeElimination": 0.0021996498107910156,
|
| 844 |
+
"DeadStoreElimination": 0.007483243942260742,
|
| 845 |
+
"DelinearIndices": 0.008810281753540039,
|
| 846 |
+
"Delinearization": 0.009731292724609375,
|
| 847 |
+
"DelinearizeSPMD": 0.04425859451293945,
|
| 848 |
+
"DoNothing": 6.67572021484375e-05,
|
| 849 |
+
"DramToDramTranspose": 0.012907743453979492,
|
| 850 |
+
"DumpGraphAndMetadata": 0.07597684860229492,
|
| 851 |
+
"EliminateDivs": 0.0021903514862060547,
|
| 852 |
+
"ExpandBatchNorm": 0.001527547836303711,
|
| 853 |
+
"ExpandISAMacro": 0.015442609786987305,
|
| 854 |
+
"FactorizeBlkDims": 0.020684003829956055,
|
| 855 |
+
"FactorizeThreadAxesInFreeDims": 0.003031015396118164,
|
| 856 |
+
"FlattenMacroLoop": 0.004990577697753906,
|
| 857 |
+
"GenericAccessSimplifier": 0.0007598400115966797,
|
| 858 |
+
"InferInitValue": 0.10130023956298828,
|
| 859 |
+
"InferIntrinsicOnCC": 0.007919549942016602,
|
| 860 |
+
"InferNeuronTensor": 0.05837249755859375,
|
| 861 |
+
"InferNonlocalTensors": 0.05706453323364258,
|
| 862 |
+
"InferPSumTensor": 0.04483771324157715,
|
| 863 |
+
"InferShardAxis": 0.4604020118713379,
|
| 864 |
+
"InferSharedMemLoc": 0.04048299789428711,
|
| 865 |
+
"InlineNativeKernels": 0.006569623947143555,
|
| 866 |
+
"InsertCoreBarrier": 0.010969161987304688,
|
| 867 |
+
"InsertIOTransposes": 0.0684211254119873,
|
| 868 |
+
"InsertImplicitShardAxisBeforeISel": 0.01549673080444336,
|
| 869 |
+
"InsertLocalTransposes": 0.022176742553710938,
|
| 870 |
+
"InsertOffloadedTransposes": 0.0181121826171875,
|
| 871 |
+
"LICM": 0.007555484771728516,
|
| 872 |
+
"LateLegalizeInst": 0.013030767440795898,
|
| 873 |
+
"LateLegalizePostSplit": 0.01993083953857422,
|
| 874 |
+
"LateLowerReshapeOp": 0.0016782283782958984,
|
| 875 |
+
"LateLowerTensorOp": 0.0021178722381591797,
|
| 876 |
+
"LateNeuronInstComb": 0.03255581855773926,
|
| 877 |
+
"LayoutPreprocessing": 0.10170960426330566,
|
| 878 |
+
"LayoutPreprocessingAndAnalysis": 0.23344039916992188,
|
| 879 |
+
"LayoutRequirementAnalysis": 0.032952308654785156,
|
| 880 |
+
"LegalizeCCOpLayout": 0.002583742141723633,
|
| 881 |
+
"LegalizeOpLevelAlias": 0.002170562744140625,
|
| 882 |
+
"LegalizePartitionReduce": 0.0025551319122314453,
|
| 883 |
+
"LegalizeSundaAccess": 0.08088016510009766,
|
| 884 |
+
"LegalizeSundaMacro": 0.04086017608642578,
|
| 885 |
+
"LegalizeType": 0.009904623031616211,
|
| 886 |
+
"LocalLayoutOpt": 0.023218154907226563,
|
| 887 |
+
"LoopFusion": 0.005990266799926758,
|
| 888 |
+
"LoopSplitting": 0.0007989406585693359,
|
| 889 |
+
"LowerBroadcast": 0.0051610469818115234,
|
| 890 |
+
"LowerCCOpBlockAxis": 0.007201671600341797,
|
| 891 |
+
"LowerComplexBroadcast": 0.00890207290649414,
|
| 892 |
+
"LowerIntrinsics": 0.09793353080749512,
|
| 893 |
+
"LowerShardAxis": 0.023633956909179688,
|
| 894 |
+
"LowerTensorOp": 0.03027796745300293,
|
| 895 |
+
"LowerToSendRecv": 0.027859210968017578,
|
| 896 |
+
"LowerTranspose": 0.0216217041015625,
|
| 897 |
+
"MacroGeneration": 0.12761783599853516,
|
| 898 |
+
"MaskPropagation": 0.01400303840637207,
|
| 899 |
+
"MemcpyElimination": 0.03596854209899902,
|
| 900 |
+
"MutateDataType": 0.0020971298217773438,
|
| 901 |
+
"NeuronAliasDependencyInduction": 0.0019202232360839844,
|
| 902 |
+
"NeuronAliasDependencyReset": 0.027405738830566406,
|
| 903 |
+
"NeuronInstComb": 0.024044275283813477,
|
| 904 |
+
"NeuronLICM": 0.027622222900390625,
|
| 905 |
+
"NeuronLoopFusion": 0.06255030632019043,
|
| 906 |
+
"NeuronLoopInterchange": 0.002681255340576172,
|
| 907 |
+
"NeuronSimplifier": 0.01907205581665039,
|
| 908 |
+
"NeuronSimplifyPredicates": 0.029021024703979492,
|
| 909 |
+
"NeuronValueNumbering": 0.011119604110717773,
|
| 910 |
+
"OptimizeAliasedCopyChain": 0.0005273818969726563,
|
| 911 |
+
"OptimizeNKIKernels": 4.391921043395996,
|
| 912 |
+
"PAGLayoutOpt": 0.16190624237060547,
|
| 913 |
+
"PComputeCutting": 0.016373872756958008,
|
| 914 |
+
"PGLayoutTilingPipeline": 2.0541465282440186,
|
| 915 |
+
"PGTiling": 0.3632845878601074,
|
| 916 |
+
"PadElimination": 0.0006501674652099609,
|
| 917 |
+
"ParAxesAnnotation": 0.08851456642150879,
|
| 918 |
+
"PartialLoopFusion": 0.05034661293029785,
|
| 919 |
+
"PartialSimdFusion": 0.014182329177856445,
|
| 920 |
+
"PerfectLoopNest": 0.0036270618438720703,
|
| 921 |
+
"RecognizeOpIdiom": 0.007064342498779297,
|
| 922 |
+
"Recompute": 0.00046062469482421875,
|
| 923 |
+
"RelaxPredicates": 0.02269601821899414,
|
| 924 |
+
"Rematerialization": 0.0019779205322265625,
|
| 925 |
+
"RemoveShardedPartitionAxes": 0.014830350875854492,
|
| 926 |
+
"ReshapeWeights": 0.0021474361419677734,
|
| 927 |
+
"ResolveAccessConflict": 0.007428646087646484,
|
| 928 |
+
"ResolveComplicatePredicates": 0.001834869384765625,
|
| 929 |
+
"RewriteReplicationMatmul": 0.006201982498168945,
|
| 930 |
+
"RewriteWeights": 0.004793643951416016,
|
| 931 |
+
"SFKVectorizer": 0.2884867191314697,
|
| 932 |
+
"ShardingPropagationAnalysis": 0.2801475524902344,
|
| 933 |
+
"SimpleAllReduceTiling": 0.008132696151733398,
|
| 934 |
+
"Simplifier": 0.003251314163208008,
|
| 935 |
+
"SimplifyMacroPredicates": 0.03280019760131836,
|
| 936 |
+
"SimplifyNeuronTensor": 0.04464459419250488,
|
| 937 |
+
"SimplifySlice": 0.0008628368377685547,
|
| 938 |
+
"SimplifyTensor": 0.014911413192749023,
|
| 939 |
+
"SpillPSum": 0.03145956993103027,
|
| 940 |
+
"SplitAPUnionSets": 0.09714126586914063,
|
| 941 |
+
"SplitAccGrp": 0.006166219711303711,
|
| 942 |
+
"StaticProfiler": 0.021403789520263672,
|
| 943 |
+
"StaticTransposeLocalTensor": 0.02319931983947754,
|
| 944 |
+
"SundaISel": 0.07143282890319824,
|
| 945 |
+
"TCTransform": 0.001344442367553711,
|
| 946 |
+
"TensorInitialization": 0.020877599716186523,
|
| 947 |
+
"TensorOpSimplifier": 0.0060787200927734375,
|
| 948 |
+
"TensorOpTransform": 0.03784608840942383,
|
| 949 |
+
"TileCCOps": 0.005100250244140625,
|
| 950 |
+
"TilingProfiler": 0.02941441535949707,
|
| 951 |
+
"TransformConvOp": 0.005896091461181641,
|
| 952 |
+
"TritiumFusion": 0.08978962898254395,
|
| 953 |
+
"ValueNumbering": 0.0032432079315185547,
|
| 954 |
+
"VectorizeDMA": 0.005987644195556641,
|
| 955 |
+
"VectorizeMatMult": 0.019278526306152344,
|
| 956 |
+
"WeightCoalescing": 0.004654884338378906,
|
| 957 |
+
"ZeroSizeTensorElimination": 0.00021028518676757813
|
| 958 |
+
},
|
| 959 |
+
"tensorizer": {
|
| 960 |
+
"DMATilingProfiler::TotalInstructionsAfterTiling": 22664.0,
|
| 961 |
+
"StaticProfiler::AifUb": 229.36119079589844,
|
| 962 |
+
"StaticProfiler::ArithmeticIntensityTensorizer": 194.92408752441406,
|
| 963 |
+
"StaticProfiler::AverageDmaLength": 2258.685546875,
|
| 964 |
+
"StaticProfiler::AverageFractalPeUtilization": 98.80319213867188,
|
| 965 |
+
"StaticProfiler::AveragePartitionUtilization": 94.51075744628906,
|
| 966 |
+
"StaticProfiler::AveragePeUtilization": 96.83863067626953,
|
| 967 |
+
"StaticProfiler::DDRTransferBytes": 420482080.0,
|
| 968 |
+
"StaticProfiler::InternalTransferBytes": 338614048.0,
|
| 969 |
+
"StaticProfiler::LoadExpanded": 118366.0,
|
| 970 |
+
"StaticProfiler::LocalizationEfficiency": 84.98564147949219,
|
| 971 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 89.59233093261719,
|
| 972 |
+
"StaticProfiler::StoreExpanded": 4458.0,
|
| 973 |
+
"StaticProfiler::TotalDMAExpanded": 122824.0,
|
| 974 |
+
"StaticProfiler::TotalDynamicInstancesCount": 27423.0,
|
| 975 |
+
"StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 26972.0,
|
| 976 |
+
"StaticProfiler::TotalLNCComm": 0.0,
|
| 977 |
+
"StaticProfiler::TotalLNCCommTransfer": 0.0,
|
| 978 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
|
| 979 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
|
| 980 |
+
"TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
|
| 981 |
+
"TilingProfiler::DmaInstructionsAfterTiling": 0.0,
|
| 982 |
+
"TilingProfiler::GenericInstructionsAfterTiling": 4.0,
|
| 983 |
+
"TilingProfiler::MatMultInstructionsAfterTiling": 11808.0,
|
| 984 |
+
"TilingProfiler::NumPfTransposes": 5.0,
|
| 985 |
+
"TilingProfiler::NumPfTransposesForIo": 1.0,
|
| 986 |
+
"TilingProfiler::NumPfTransposesForLocal": 1.0,
|
| 987 |
+
"TilingProfiler::NumPfTransposesForNonlocal": 3.0,
|
| 988 |
+
"TilingProfiler::PfTransposeInstructions": 9889.0,
|
| 989 |
+
"TilingProfiler::PfTransposeInstructionsForIo": 9504.0,
|
| 990 |
+
"TilingProfiler::PfTransposeInstructionsForLocal": 1.0,
|
| 991 |
+
"TilingProfiler::PfTransposeInstructionsForNonlocal": 384.0,
|
| 992 |
+
"TilingProfiler::ReduceInstructionsAfterTiling": 6.0,
|
| 993 |
+
"TilingProfiler::SimdInstructionsAfterTiling": 165.0,
|
| 994 |
+
"TilingProfiler::TotalInstructionsAfterTiling": 0.0,
|
| 995 |
+
"TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
|
| 996 |
+
"TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
|
| 997 |
+
"TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
|
| 998 |
+
"TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
|
| 999 |
+
"TransformConvOp::conv2d_column_packing": 0.0,
|
| 1000 |
+
"TransformConvOp::conv2d_column_packing_1": 0.0,
|
| 1001 |
+
"TransformConvOp::conv2d_column_packing_io10": 0.0,
|
| 1002 |
+
"TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
|
| 1003 |
+
}
|
| 1004 |
+
},
|
| 1005 |
+
"sg01": {
|
| 1006 |
+
"compiletime": {
|
| 1007 |
+
"CanonicalizeConv": 7.000000096013537e-06,
|
| 1008 |
+
"CanonicalizeForTensorizer": 1.1000000085914508e-05,
|
| 1009 |
+
"Canonicalizer": 0.00023700000019744039,
|
| 1010 |
+
"HoistCompute": 4.999999873689376e-06,
|
| 1011 |
+
"IdentifyCrossPassTensors": 1.2999999853491317e-05,
|
| 1012 |
+
"MemcastMotion": 7.999999979801942e-06,
|
| 1013 |
+
"PenguinizeFunctions": 1.2000000424450263e-05,
|
| 1014 |
+
"PruneFunctions": 1.700000029813964e-05,
|
| 1015 |
+
"RemoveOptimizationBarriers": 2.300000051036477e-05,
|
| 1016 |
+
"ScatterMotion": 1.700000029813964e-05,
|
| 1017 |
+
"TensorizerLegalizationPass": 1.5999999959603883e-05,
|
| 1018 |
+
"VerifySupportedOps": 9.999999747378752e-06,
|
| 1019 |
+
"algsimp": 4.70000013592653e-05,
|
| 1020 |
+
"batchnorm_expander": 1.2000000424450263e-05,
|
| 1021 |
+
"boundary-marker-removal": 4.999999873689376e-06,
|
| 1022 |
+
"call-inliner": 7.999999979801942e-06,
|
| 1023 |
+
"canonicalize-boundary-marker": 6.000000212225132e-06,
|
| 1024 |
+
"collective-stream-id-checker": 3.000000106112566e-06,
|
| 1025 |
+
"comparison-expander": 7.000000096013537e-06,
|
| 1026 |
+
"computation-deduplicator": 1.8999999156221747e-05,
|
| 1027 |
+
"config-lowering": 3.7000001611886546e-05,
|
| 1028 |
+
"constant_folding": 7.000000096013537e-06,
|
| 1029 |
+
"cse": 9.999999747378752e-06,
|
| 1030 |
+
"dce": 9.999999974752427e-07,
|
| 1031 |
+
"dynamic-slice-transpose": 3.999999989900971e-06,
|
| 1032 |
+
"eliminate-redundant-compare": 3.000000106112566e-06,
|
| 1033 |
+
"emit-offloaded-dropout": 1.2000000424450263e-05,
|
| 1034 |
+
"flatten-call-graph": 7.000000096013537e-06,
|
| 1035 |
+
"fuse-send-recv": 1.8999999156221747e-05,
|
| 1036 |
+
"hilo-conditional-to-select": 3.999999989900971e-06,
|
| 1037 |
+
"hilo::LegalizeAlias": 3.999999989900971e-06,
|
| 1038 |
+
"hilo::NeuronInstCombine": 5.0999999075429514e-05,
|
| 1039 |
+
"hilo::NeuronOpFusion": 1.700000029813964e-05,
|
| 1040 |
+
"hilo::ReplaceTokenTypeWithU8Pass": 1.2999999853491317e-05,
|
| 1041 |
+
"hilo::ScheduleFusion": 1.9999999949504854e-06,
|
| 1042 |
+
"hilo::SixtyFourHack": 1.2999999853491317e-05,
|
| 1043 |
+
"hilo::VerifyAliasing": 9.999999974752427e-07,
|
| 1044 |
+
"hlo-mac-count": 8.199999865610152e-05,
|
| 1045 |
+
"legalize-ccops-for-tensorizer": 9.999999974752427e-07,
|
| 1046 |
+
"legalize-compare": 3.999999989900971e-06,
|
| 1047 |
+
"lower-argminmax-custom-call": 3.999999989900971e-06,
|
| 1048 |
+
"map-inline": 1.1000000085914508e-05,
|
| 1049 |
+
"metadata-naming": 1.8000000636675395e-05,
|
| 1050 |
+
"mlir::detail::OpToOpPassAdaptor": 2.5999999706982635e-05,
|
| 1051 |
+
"mlir::hlo::MhloToPyPenguin": 0.0009560000034980476,
|
| 1052 |
+
"mlir::mhlo::LowerComplexExtraPass": 7.000000186963007e-05,
|
| 1053 |
+
"mlir::mhlo::LowerComplexPass": 0.00014000000373926014,
|
| 1054 |
+
"native-to-custom-softmax": 4.999999873689376e-06,
|
| 1055 |
+
"native-to-custom-softmax-dx": 1.8000000636675395e-05,
|
| 1056 |
+
"neuron-hlo-verifier": 0.0003600000054575503,
|
| 1057 |
+
"operand_upcaster": 1.4999999621068127e-05,
|
| 1058 |
+
"post-par-pipe-begin": 9.999999974752427e-07,
|
| 1059 |
+
"post-par-pipe-end": 0.0,
|
| 1060 |
+
"post-partition-simplification": 0.0004780000017490238,
|
| 1061 |
+
"replace-minimum-constant": 3.999999989900971e-06,
|
| 1062 |
+
"reshape-mover": 1.9999999949504854e-06,
|
| 1063 |
+
"simplify-concat": 3.9999998989515007e-05,
|
| 1064 |
+
"simplify-while-loops": 1.9999999949504854e-06,
|
| 1065 |
+
"transform-variadic-reduce": 9.000000318337698e-06,
|
| 1066 |
+
"tuple-simplifier": 3.999999989900971e-06,
|
| 1067 |
+
"unpack-nested-aws-ntwsr": 3.999999989900971e-06,
|
| 1068 |
+
"unroll-while-loop": 0.0
|
| 1069 |
+
},
|
| 1070 |
+
"hilo": {
|
| 1071 |
+
"ArithmeticIntensity": 374.9828186035156,
|
| 1072 |
+
"HloMacCount": 26843545600.0,
|
| 1073 |
+
"Traffic": 143172128.0
|
| 1074 |
+
}
|
| 1075 |
+
},
|
| 1076 |
+
"sg02": {
|
| 1077 |
+
"compiletime": {
|
| 1078 |
+
"CanonicalizeConv": 1.1000000085914508e-05,
|
| 1079 |
+
"CanonicalizeForTensorizer": 1.2000000424450263e-05,
|
| 1080 |
+
"Canonicalizer": 0.0002899999963119626,
|
| 1081 |
+
"HoistCompute": 3.999999989900971e-06,
|
| 1082 |
+
"IdentifyCrossPassTensors": 2.300000051036477e-05,
|
| 1083 |
+
"MemcastMotion": 1.1000000085914508e-05,
|
| 1084 |
+
"PenguinizeFunctions": 1.2000000424450263e-05,
|
| 1085 |
+
"PruneFunctions": 7.999999979801942e-06,
|
| 1086 |
+
"RemoveOptimizationBarriers": 1.4000000192027073e-05,
|
| 1087 |
+
"ScatterMotion": 3.000000106112566e-06,
|
| 1088 |
+
"TensorizerLegalizationPass": 7.000000096013537e-06,
|
| 1089 |
+
"VerifySupportedOps": 1.2000000424450263e-05,
|
| 1090 |
+
"algsimp": 5.6000000768108293e-05,
|
| 1091 |
+
"batchnorm_expander": 9.999999747378752e-06,
|
| 1092 |
+
"boundary-marker-removal": 3.999999989900971e-06,
|
| 1093 |
+
"call-inliner": 9.999999747378752e-06,
|
| 1094 |
+
"canonicalize-boundary-marker": 4.999999873689376e-06,
|
| 1095 |
+
"collective-stream-id-checker": 3.000000106112566e-06,
|
| 1096 |
+
"comparison-expander": 4.999999873689376e-06,
|
| 1097 |
+
"computation-deduplicator": 2.099999983329326e-05,
|
| 1098 |
+
"config-lowering": 4.400000034365803e-05,
|
| 1099 |
+
"constant_folding": 7.999999979801942e-06,
|
| 1100 |
+
"cse": 1.4000000192027073e-05,
|
| 1101 |
+
"dce": 9.999999974752427e-07,
|
| 1102 |
+
"dynamic-slice-transpose": 3.999999989900971e-06,
|
| 1103 |
+
"eliminate-redundant-compare": 3.000000106112566e-06,
|
| 1104 |
+
"emit-offloaded-dropout": 1.2999999853491317e-05,
|
| 1105 |
+
"flatten-call-graph": 1.1000000085914508e-05,
|
| 1106 |
+
"fuse-send-recv": 1.8999999156221747e-05,
|
| 1107 |
+
"hilo-conditional-to-select": 4.999999873689376e-06,
|
| 1108 |
+
"hilo::LegalizeAlias": 1.9999999949504854e-06,
|
| 1109 |
+
"hilo::NeuronInstCombine": 6.900000153109431e-05,
|
| 1110 |
+
"hilo::NeuronOpFusion": 6.000000212225132e-06,
|
| 1111 |
+
"hilo::ReplaceTokenTypeWithU8Pass": 1.5999999959603883e-05,
|
| 1112 |
+
"hilo::ScheduleFusion": 9.999999974752427e-07,
|
| 1113 |
+
"hilo::SixtyFourHack": 4.400000034365803e-05,
|
| 1114 |
+
"hilo::VerifyAliasing": 9.999999974752427e-07,
|
| 1115 |
+
"hlo-mac-count": 0.004767000209540129,
|
| 1116 |
+
"legalize-ccops-for-tensorizer": 9.999999974752427e-07,
|
| 1117 |
+
"legalize-compare": 3.000000106112566e-06,
|
| 1118 |
+
"lower-argminmax-custom-call": 3.000000106112566e-06,
|
| 1119 |
+
"map-inline": 1.1000000085914508e-05,
|
| 1120 |
+
"metadata-naming": 1.700000029813964e-05,
|
| 1121 |
+
"mlir::detail::OpToOpPassAdaptor": 2.9000000722589903e-05,
|
| 1122 |
+
"mlir::hlo::MhloToPyPenguin": 0.006047999951988459,
|
| 1123 |
+
"mlir::mhlo::LowerComplexExtraPass": 7.599999662488699e-05,
|
| 1124 |
+
"mlir::mhlo::LowerComplexPass": 9.699999645818025e-05,
|
| 1125 |
+
"native-to-custom-softmax": 6.000000212225132e-06,
|
| 1126 |
+
"native-to-custom-softmax-dx": 2.2000000171829015e-05,
|
| 1127 |
+
"neuron-hlo-verifier": 0.0003600000054575503,
|
| 1128 |
+
"operand_upcaster": 1.700000029813964e-05,
|
| 1129 |
+
"post-par-pipe-begin": 3.000000106112566e-06,
|
| 1130 |
+
"post-par-pipe-end": 0.0,
|
| 1131 |
+
"post-partition-simplification": 0.0005520000122487545,
|
| 1132 |
+
"replace-minimum-constant": 9.000000318337698e-06,
|
| 1133 |
+
"reshape-mover": 3.000000106112566e-06,
|
| 1134 |
+
"simplify-concat": 4.199999966658652e-05,
|
| 1135 |
+
"simplify-while-loops": 1.9999999949504854e-06,
|
| 1136 |
+
"transform-variadic-reduce": 4.70000013592653e-05,
|
| 1137 |
+
"tuple-simplifier": 4.999999873689376e-06,
|
| 1138 |
+
"unpack-nested-aws-ntwsr": 3.999999989900971e-06,
|
| 1139 |
+
"unroll-while-loop": 0.0
|
| 1140 |
+
},
|
| 1141 |
+
"hilo": {
|
| 1142 |
+
"ArithmeticIntensity": 107.69713592529297,
|
| 1143 |
+
"HloMacCount": 19483983872.0,
|
| 1144 |
+
"Traffic": 361829184.0
|
| 1145 |
+
}
|
| 1146 |
+
},
|
| 1147 |
+
"topk": {
|
| 1148 |
+
"compiletime": {
|
| 1149 |
+
"CoalesceCCOp": 0.0069692134857177734,
|
| 1150 |
+
"DMALocalityOpt": 0.006772279739379883,
|
| 1151 |
+
"DMAProfiler": 0.008215665817260742,
|
| 1152 |
+
"DataStreaming": 0.012622594833374023,
|
| 1153 |
+
"DoNothing": 0.004723310470581055,
|
| 1154 |
+
"ExpandISAMacro": 0.007757902145385742,
|
| 1155 |
+
"FactorizeBlkDims": 0.030848026275634766,
|
| 1156 |
+
"InferPSumTensor": 0.023444652557373047,
|
| 1157 |
+
"InferSharedMemLoc": 0.010675668716430664,
|
| 1158 |
+
"InsertCoreBarrier": 0.007489681243896484,
|
| 1159 |
+
"LateLegalizeInst": 0.01503753662109375,
|
| 1160 |
+
"LateNeuronInstComb": 0.017124652862548828,
|
| 1161 |
+
"LegalizeSundaAccess": 0.028142213821411133,
|
| 1162 |
+
"LegalizeType": 0.02222132682800293,
|
| 1163 |
+
"LowerBroadcast": 0.006150484085083008,
|
| 1164 |
+
"LowerIntrinsics": 0.00729680061340332,
|
| 1165 |
+
"LowerTranspose": 0.006754398345947266,
|
| 1166 |
+
"NeuronInstComb": 0.016539335250854492,
|
| 1167 |
+
"NeuronLICM": 0.024366140365600586,
|
| 1168 |
+
"NeuronSimplifyPredicates": 0.006876230239868164,
|
| 1169 |
+
"NeuronValueNumbering": 0.007918596267700195,
|
| 1170 |
+
"SFKVectorizer": 0.11957359313964844,
|
| 1171 |
+
"SimpleAllReduceTiling": 0.016579627990722656,
|
| 1172 |
+
"SimplifyNeuronTensor": 0.10249876976013184,
|
| 1173 |
+
"SpillPSum": 0.03416609764099121,
|
| 1174 |
+
"WeightCoalescing": 0.009296655654907227
|
| 1175 |
+
}
|
| 1176 |
+
}
|
| 1177 |
+
}
|
context_encoding_model/_tp0_bk3/graph.neff
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bdea9302d0f9d0785d148992ac29a3b377a867a1a9ce89c40e3ccad020e4ef73
|
| 3 |
+
size 1506304
|
context_encoding_model/_tp0_bk3/log-neuron-cc.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
context_encoding_model/_tp0_bk3/metaneff.pb
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1a788ec9ea41bfa0696307ae7b82f6644a908b0b0a1feb7f30da3ca4349d0c13
|
| 3 |
+
size 2955932
|
context_encoding_model/_tp0_bk3/model.MODULE_be035899334776123ed5+d208bdce.hlo_module.pb
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:616f0c948889cd427dac21bbe629a046747b018871cf2815b3477d1f3d54d269
|
| 3 |
+
size 3042718
|
context_encoding_model/_tp0_bk3/model.MODULE_be035899334776123ed5+d208bdce.neff
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bdea9302d0f9d0785d148992ac29a3b377a867a1a9ce89c40e3ccad020e4ef73
|
| 3 |
+
size 1506304
|
context_encoding_model/_tp0_bk3/neuron_config.json
ADDED
|
@@ -0,0 +1,224 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_attn_implementation_autoset": false,
|
| 3 |
+
"_name_or_path": "/home/ubuntu/models/Qwen3-1.7B",
|
| 4 |
+
"add_cross_attention": false,
|
| 5 |
+
"architectures": [
|
| 6 |
+
"Qwen3ForCausalLM"
|
| 7 |
+
],
|
| 8 |
+
"attention_bias": false,
|
| 9 |
+
"attention_dropout": 0.0,
|
| 10 |
+
"attribute_map": {},
|
| 11 |
+
"bad_words_ids": null,
|
| 12 |
+
"begin_suppress_tokens": null,
|
| 13 |
+
"bos_token_id": 151643,
|
| 14 |
+
"chunk_size_feed_forward": 0,
|
| 15 |
+
"cross_attention_hidden_size": null,
|
| 16 |
+
"decoder_start_token_id": null,
|
| 17 |
+
"diversity_penalty": 0.0,
|
| 18 |
+
"do_sample": false,
|
| 19 |
+
"early_stopping": false,
|
| 20 |
+
"encoder_no_repeat_ngram_size": 0,
|
| 21 |
+
"eos_token_id": 151645,
|
| 22 |
+
"exponential_decay_length_penalty": null,
|
| 23 |
+
"finetuning_task": null,
|
| 24 |
+
"forced_bos_token_id": null,
|
| 25 |
+
"forced_eos_token_id": null,
|
| 26 |
+
"fused_spec_config": null,
|
| 27 |
+
"head_dim": 128,
|
| 28 |
+
"hidden_act": "silu",
|
| 29 |
+
"hidden_size": 2048,
|
| 30 |
+
"id2label": {
|
| 31 |
+
"0": "LABEL_0",
|
| 32 |
+
"1": "LABEL_1"
|
| 33 |
+
},
|
| 34 |
+
"initializer_range": 0.02,
|
| 35 |
+
"intermediate_size": 6144,
|
| 36 |
+
"is_decoder": false,
|
| 37 |
+
"is_encoder_decoder": false,
|
| 38 |
+
"label2id": {
|
| 39 |
+
"LABEL_0": 0,
|
| 40 |
+
"LABEL_1": 1
|
| 41 |
+
},
|
| 42 |
+
"length_penalty": 1.0,
|
| 43 |
+
"max_length": 20,
|
| 44 |
+
"max_position_embeddings": 40960,
|
| 45 |
+
"max_window_layers": 28,
|
| 46 |
+
"metadata": null,
|
| 47 |
+
"min_length": 0,
|
| 48 |
+
"model_type": "qwen3",
|
| 49 |
+
"neuron_config": {
|
| 50 |
+
"activation_quantization_type": null,
|
| 51 |
+
"allow_input_truncation": false,
|
| 52 |
+
"apply_seq_ids_mask": false,
|
| 53 |
+
"async_mode": false,
|
| 54 |
+
"attention_dp_degree": 1,
|
| 55 |
+
"attention_dtype": null,
|
| 56 |
+
"attn_block_cte_nki_kernel_enabled": false,
|
| 57 |
+
"attn_block_tkg_nki_kernel_cache_update": false,
|
| 58 |
+
"attn_block_tkg_nki_kernel_cascaded_attention": false,
|
| 59 |
+
"attn_block_tkg_nki_kernel_enabled": false,
|
| 60 |
+
"attn_cls": {
|
| 61 |
+
"__module__": "neuronx_distributed_inference.models.qwen3.modeling_qwen3",
|
| 62 |
+
"__name__": "NeuronQwen3Attention"
|
| 63 |
+
},
|
| 64 |
+
"attn_kernel_enabled": null,
|
| 65 |
+
"attn_tkg_builtin_kernel_enabled": false,
|
| 66 |
+
"attn_tkg_nki_kernel_enabled": false,
|
| 67 |
+
"batch_size": 1,
|
| 68 |
+
"bucket_n_active_tokens": true,
|
| 69 |
+
"buckets": [
|
| 70 |
+
1024
|
| 71 |
+
],
|
| 72 |
+
"cast_type": "config",
|
| 73 |
+
"cc_pipeline_tiling_factor": 2,
|
| 74 |
+
"chunked_prefill_config": null,
|
| 75 |
+
"context_encoding_buckets": [
|
| 76 |
+
1024
|
| 77 |
+
],
|
| 78 |
+
"cp_degree": 1,
|
| 79 |
+
"ctx_batch_size": 1,
|
| 80 |
+
"disable_kv_cache_tiling": false,
|
| 81 |
+
"draft_model_modules_to_not_convert": null,
|
| 82 |
+
"enable_bucketing": true,
|
| 83 |
+
"enable_cte_modular_flow": false,
|
| 84 |
+
"enable_eagle_draft_input_norm": false,
|
| 85 |
+
"enable_eagle_speculation": false,
|
| 86 |
+
"enable_fused_speculation": false,
|
| 87 |
+
"enable_long_context_mode": false,
|
| 88 |
+
"enable_output_completion_notifications": false,
|
| 89 |
+
"enable_spill_reload_dge": false,
|
| 90 |
+
"enable_token_tree": false,
|
| 91 |
+
"ep_degree": 1,
|
| 92 |
+
"expert_mlp_nki_kernel_enabled": null,
|
| 93 |
+
"flash_decoding_enabled": false,
|
| 94 |
+
"fused_qkv": false,
|
| 95 |
+
"fused_rmsnorm_skip_gamma": false,
|
| 96 |
+
"is_block_kv_layout": null,
|
| 97 |
+
"is_chunked_prefill": false,
|
| 98 |
+
"is_continuous_batching": true,
|
| 99 |
+
"is_eagle_draft": false,
|
| 100 |
+
"is_medusa": false,
|
| 101 |
+
"is_prefill_stage": true,
|
| 102 |
+
"is_prefix_caching": false,
|
| 103 |
+
"k_cache_transposed": false,
|
| 104 |
+
"kv_cache_batch_size": 8,
|
| 105 |
+
"kv_cache_padding_size": 0,
|
| 106 |
+
"kv_cache_quant": false,
|
| 107 |
+
"kv_cache_tiling": false,
|
| 108 |
+
"layer_boundary_markers": false,
|
| 109 |
+
"lm_head_pad": true,
|
| 110 |
+
"lm_head_pad_alignment_size": 1,
|
| 111 |
+
"local_ranks_size": 2,
|
| 112 |
+
"logical_nc_config": 2,
|
| 113 |
+
"lora_config": null,
|
| 114 |
+
"max_batch_size": 8,
|
| 115 |
+
"max_context_length": 4096,
|
| 116 |
+
"max_length": 4096,
|
| 117 |
+
"max_new_tokens": null,
|
| 118 |
+
"medusa_speculation_length": 0,
|
| 119 |
+
"medusa_tree": null,
|
| 120 |
+
"mlp_kernel_enabled": false,
|
| 121 |
+
"mlp_kernel_fuse_residual_add": false,
|
| 122 |
+
"modules_to_not_convert": null,
|
| 123 |
+
"moe_fused_nki_kernel_enabled": null,
|
| 124 |
+
"n_active_tokens": 4096,
|
| 125 |
+
"n_positions": 4096,
|
| 126 |
+
"num_medusa_heads": 0,
|
| 127 |
+
"on_cpu": false,
|
| 128 |
+
"on_device_sampling_config": {
|
| 129 |
+
"deterministic": false,
|
| 130 |
+
"do_sample": false,
|
| 131 |
+
"dynamic": true,
|
| 132 |
+
"global_topk": 256,
|
| 133 |
+
"on_device_sampling_config": true,
|
| 134 |
+
"temperature": 1.0,
|
| 135 |
+
"top_k": 1,
|
| 136 |
+
"top_k_kernel_enabled": false,
|
| 137 |
+
"top_p": 1.0
|
| 138 |
+
},
|
| 139 |
+
"output_logits": false,
|
| 140 |
+
"overrides_torch_dtype": true,
|
| 141 |
+
"pa_block_size": 4096,
|
| 142 |
+
"pa_num_blocks": 8,
|
| 143 |
+
"padding_side": "right",
|
| 144 |
+
"pp_degree": 1,
|
| 145 |
+
"prefix_buckets": null,
|
| 146 |
+
"qk_layernorm": false,
|
| 147 |
+
"qkv_kernel_enabled": false,
|
| 148 |
+
"qkv_kernel_fuse_residual_add": false,
|
| 149 |
+
"qkv_kernel_nbsd_layout": false,
|
| 150 |
+
"quantization_dtype": "int8",
|
| 151 |
+
"quantization_type": "per_tensor_symmetric",
|
| 152 |
+
"quantize_clamp_bound": Infinity,
|
| 153 |
+
"quantized": false,
|
| 154 |
+
"quantized_checkpoints_path": null,
|
| 155 |
+
"quantized_mlp_kernel_enabled": false,
|
| 156 |
+
"rmsnorm_quantize_kernel_enabled": false,
|
| 157 |
+
"router_topk_nki_kernel_enabled": null,
|
| 158 |
+
"rpl_reduce_dtype": null,
|
| 159 |
+
"save_sharded_checkpoint": true,
|
| 160 |
+
"scratchpad_page_size": null,
|
| 161 |
+
"seq_len": 4096,
|
| 162 |
+
"seq_len_threshold_for_cc_tiling": 16384,
|
| 163 |
+
"sequence_parallel_enabled": false,
|
| 164 |
+
"shared_mlp_nki_kernel_enabled": null,
|
| 165 |
+
"skip_sharding": false,
|
| 166 |
+
"skip_warmup": false,
|
| 167 |
+
"spec_batch_size": 8,
|
| 168 |
+
"speculation_length": 0,
|
| 169 |
+
"start_rank_id": 0,
|
| 170 |
+
"strided_context_parallel_kernel_enabled": false,
|
| 171 |
+
"target": null,
|
| 172 |
+
"tensor_capture_config": null,
|
| 173 |
+
"tile_cc": false,
|
| 174 |
+
"tkg_batch_size": 8,
|
| 175 |
+
"token_generation_buckets": null,
|
| 176 |
+
"token_tree_config": null,
|
| 177 |
+
"torch_dtype": "bfloat16",
|
| 178 |
+
"tp_degree": 2,
|
| 179 |
+
"vocab_parallel": false,
|
| 180 |
+
"weight_gather_seq_len_threshold": 32768,
|
| 181 |
+
"weights_to_skip_layout_optimization": [],
|
| 182 |
+
"world_size": 2
|
| 183 |
+
},
|
| 184 |
+
"no_repeat_ngram_size": 0,
|
| 185 |
+
"num_attention_heads": 16,
|
| 186 |
+
"num_beam_groups": 1,
|
| 187 |
+
"num_beams": 1,
|
| 188 |
+
"num_cores_per_group": 1,
|
| 189 |
+
"num_hidden_layers": 28,
|
| 190 |
+
"num_key_value_heads": 8,
|
| 191 |
+
"num_return_sequences": 1,
|
| 192 |
+
"output_attentions": false,
|
| 193 |
+
"output_hidden_states": false,
|
| 194 |
+
"output_scores": false,
|
| 195 |
+
"pad_token_id": 0,
|
| 196 |
+
"prefix": null,
|
| 197 |
+
"problem_type": null,
|
| 198 |
+
"pruned_heads": {},
|
| 199 |
+
"remove_invalid_values": false,
|
| 200 |
+
"repetition_penalty": 1.0,
|
| 201 |
+
"return_dict": true,
|
| 202 |
+
"return_dict_in_generate": false,
|
| 203 |
+
"rms_norm_eps": 1e-06,
|
| 204 |
+
"rope_scaling": null,
|
| 205 |
+
"rope_theta": 1000000,
|
| 206 |
+
"sep_token_id": null,
|
| 207 |
+
"sliding_window": null,
|
| 208 |
+
"suppress_tokens": null,
|
| 209 |
+
"task_specific_params": null,
|
| 210 |
+
"temperature": 1.0,
|
| 211 |
+
"tf_legacy_loss": false,
|
| 212 |
+
"tie_encoder_decoder": false,
|
| 213 |
+
"tie_word_embeddings": true,
|
| 214 |
+
"tokenizer_class": null,
|
| 215 |
+
"top_k": 50,
|
| 216 |
+
"top_p": 1.0,
|
| 217 |
+
"torchscript": false,
|
| 218 |
+
"transformers_version": "4.51.0",
|
| 219 |
+
"typical_p": 1.0,
|
| 220 |
+
"use_bfloat16": false,
|
| 221 |
+
"use_cache": true,
|
| 222 |
+
"use_sliding_window": false,
|
| 223 |
+
"vocab_size": 151936
|
| 224 |
+
}
|
context_encoding_model/_tp0_bk4/command.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
neuronx-cc compile --framework=XLA model.MODULE_95ef7ca73cc0a6161be2+96be3c33.hlo_module.pb --output model.MODULE_95ef7ca73cc0a6161be2+96be3c33.neff --target=trn2 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ' --lnc=2 -O1 '--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true' --logfile=log-neuron-cc.txt --verbose=35
|
context_encoding_model/_tp0_bk4/compile_flags.MODULE_95ef7ca73cc0a6161be2+96be3c33.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
["--target=trn2", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "--lnc=2", "-O1", "--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true", "--logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk4/log-neuron-cc.txt"]
|
context_encoding_model/_tp0_bk4/global_metric_store.json
ADDED
|
@@ -0,0 +1,1177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"Average": {
|
| 3 |
+
"tensorizer": {
|
| 4 |
+
"StaticProfiler::AverageFractalPeUtilization": 98.93502807617188,
|
| 5 |
+
"StaticProfiler::AveragePartitionUtilization": 95.0970230102539,
|
| 6 |
+
"StaticProfiler::AveragePeUtilization": 97.18069458007813,
|
| 7 |
+
"StaticProfiler::LocalizationEfficiency": 73.73954010009766,
|
| 8 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 79.92718505859375,
|
| 9 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
|
| 10 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 0.0
|
| 11 |
+
}
|
| 12 |
+
},
|
| 13 |
+
"Count": {
|
| 14 |
+
"tensorizer": {
|
| 15 |
+
"StaticProfiler::AverageFractalPeUtilization": 1.0,
|
| 16 |
+
"StaticProfiler::AveragePartitionUtilization": 1.0,
|
| 17 |
+
"StaticProfiler::AveragePeUtilization": 1.0,
|
| 18 |
+
"StaticProfiler::LocalizationEfficiency": 1.0,
|
| 19 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1.0,
|
| 20 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 1.0,
|
| 21 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 1.0
|
| 22 |
+
}
|
| 23 |
+
},
|
| 24 |
+
"Sum": {
|
| 25 |
+
"compiletime": {
|
| 26 |
+
"AGOrderingAnalysisPass": 0.04760026931762695,
|
| 27 |
+
"AffinePredicateResolution": 0.003319978713989258,
|
| 28 |
+
"AliasDependencyElimination": 0.0002167224884033203,
|
| 29 |
+
"AliasDependencyInduction": 0.008548259735107422,
|
| 30 |
+
"AliasDependencyReset": 0.03149843215942383,
|
| 31 |
+
"BFComputeCutting": 0.00810694694519043,
|
| 32 |
+
"BirCodeGenLoop": 0.2911098003387451,
|
| 33 |
+
"CCOpFusion": 0.08548593521118164,
|
| 34 |
+
"CanonicalizeConv": 2.7000001864507794e-05,
|
| 35 |
+
"CanonicalizeDAGForPGTiling": 0.007600545883178711,
|
| 36 |
+
"CanonicalizeForTensorizer": 5.699999746866524e-05,
|
| 37 |
+
"CanonicalizeIR": 0.0030400753021240234,
|
| 38 |
+
"Canonicalizer": 0.0011950000189244747,
|
| 39 |
+
"CoalesceCCOp": 0.020453453063964844,
|
| 40 |
+
"CommuteConcat": 0.007961034774780273,
|
| 41 |
+
"DMALocalityOpt": 0.016626834869384766,
|
| 42 |
+
"DMAProfiler": 0.018386363983154297,
|
| 43 |
+
"DMATilingProfiler": 0.009016752243041992,
|
| 44 |
+
"DataLocalityOpt": 0.17029356956481934,
|
| 45 |
+
"DataStreaming": 0.03981828689575195,
|
| 46 |
+
"DeConcat": 0.01120138168334961,
|
| 47 |
+
"DeadCodeElimination": 0.010882377624511719,
|
| 48 |
+
"DeadStoreElimination": 0.010195016860961914,
|
| 49 |
+
"DelinearIndices": 0.010077953338623047,
|
| 50 |
+
"Delinearization": 0.011870861053466797,
|
| 51 |
+
"DelinearizeSPMD": 0.035944223403930664,
|
| 52 |
+
"DoNothing": 0.0005605220794677734,
|
| 53 |
+
"DramToDramTranspose": 0.013046979904174805,
|
| 54 |
+
"DumpGraphAndMetadata": 0.03416705131530762,
|
| 55 |
+
"EliminateDivs": 0.004259586334228516,
|
| 56 |
+
"ExpandBatchNorm": 0.0017371177673339844,
|
| 57 |
+
"ExpandISAMacro": 0.014496326446533203,
|
| 58 |
+
"FactorizeBlkDims": 0.07086968421936035,
|
| 59 |
+
"FactorizeThreadAxesInFreeDims": 0.00911855697631836,
|
| 60 |
+
"FlattenMacroLoop": 0.0048520565032958984,
|
| 61 |
+
"GenericAccessSimplifier": 0.001367330551147461,
|
| 62 |
+
"HoistCompute": 6.000000212225132e-06,
|
| 63 |
+
"IdentifyCrossPassTensors": 6.199999916134402e-05,
|
| 64 |
+
"InferInitValue": 0.0836641788482666,
|
| 65 |
+
"InferIntrinsicOnCC": 0.008740901947021484,
|
| 66 |
+
"InferNeuronTensor": 0.05709338188171387,
|
| 67 |
+
"InferNonlocalTensors": 0.041548728942871094,
|
| 68 |
+
"InferPSumTensor": 0.23330998420715332,
|
| 69 |
+
"InferShardAxis": 0.5781030654907227,
|
| 70 |
+
"InferSharedMemLoc": 0.03158235549926758,
|
| 71 |
+
"InlineNativeKernels": 0.002477407455444336,
|
| 72 |
+
"InsertCoreBarrier": 0.015990734100341797,
|
| 73 |
+
"InsertIOTransposes": 0.039937734603881836,
|
| 74 |
+
"InsertImplicitShardAxisBeforeISel": 0.013466596603393555,
|
| 75 |
+
"InsertLocalTransposes": 0.018125534057617188,
|
| 76 |
+
"InsertOffloadedTransposes": 0.014874696731567383,
|
| 77 |
+
"LICM": 0.0058231353759765625,
|
| 78 |
+
"LateLegalizeInst": 0.037004947662353516,
|
| 79 |
+
"LateLegalizePostSplit": 0.02429652214050293,
|
| 80 |
+
"LateLowerReshapeOp": 0.0018832683563232422,
|
| 81 |
+
"LateLowerTensorOp": 0.0021920204162597656,
|
| 82 |
+
"LateNeuronInstComb": 0.06391644477844238,
|
| 83 |
+
"LayoutPreprocessing": 0.06973385810852051,
|
| 84 |
+
"LayoutPreprocessingAndAnalysis": 0.11140203475952148,
|
| 85 |
+
"LayoutRequirementAnalysis": 0.013022661209106445,
|
| 86 |
+
"LegalizeCCOpLayout": 0.0020427703857421875,
|
| 87 |
+
"LegalizeOpLevelAlias": 0.0016918182373046875,
|
| 88 |
+
"LegalizePartitionReduce": 0.0030241012573242188,
|
| 89 |
+
"LegalizeSundaAccess": 0.08372640609741211,
|
| 90 |
+
"LegalizeSundaMacro": 0.02708148956298828,
|
| 91 |
+
"LegalizeType": 0.04078388214111328,
|
| 92 |
+
"LocalLayoutOpt": 0.022045135498046875,
|
| 93 |
+
"LoopFusion": 0.029404163360595703,
|
| 94 |
+
"LoopSplitting": 0.0007355213165283203,
|
| 95 |
+
"LowerBroadcast": 0.02869558334350586,
|
| 96 |
+
"LowerCCOpBlockAxis": 0.007714748382568359,
|
| 97 |
+
"LowerComplexBroadcast": 0.005654096603393555,
|
| 98 |
+
"LowerIntrinsics": 0.051032304763793945,
|
| 99 |
+
"LowerShardAxis": 0.03305673599243164,
|
| 100 |
+
"LowerTensorOp": 0.028458356857299805,
|
| 101 |
+
"LowerToSendRecv": 0.03391242027282715,
|
| 102 |
+
"LowerTranspose": 0.051642656326293945,
|
| 103 |
+
"MacroGeneration": 0.06428074836730957,
|
| 104 |
+
"MaskPropagation": 0.0036263465881347656,
|
| 105 |
+
"MemcastMotion": 1.700000029813964e-05,
|
| 106 |
+
"MemcpyElimination": 0.05451250076293945,
|
| 107 |
+
"MutateDataType": 0.001516103744506836,
|
| 108 |
+
"NeuronAliasDependencyInduction": 0.0005834102630615234,
|
| 109 |
+
"NeuronAliasDependencyReset": 0.022034168243408203,
|
| 110 |
+
"NeuronInstComb": 0.06097984313964844,
|
| 111 |
+
"NeuronLICM": 0.05481839179992676,
|
| 112 |
+
"NeuronLoopFusion": 0.07339620590209961,
|
| 113 |
+
"NeuronLoopInterchange": 0.0027348995208740234,
|
| 114 |
+
"NeuronSimplifier": 0.021918296813964844,
|
| 115 |
+
"NeuronSimplifyPredicates": 0.024098873138427734,
|
| 116 |
+
"NeuronValueNumbering": 0.022985458374023438,
|
| 117 |
+
"OptimizeAliasedCopyChain": 0.0008976459503173828,
|
| 118 |
+
"OptimizeNKIKernels": 4.611967086791992,
|
| 119 |
+
"PAGLayoutOpt": 0.2917053699493408,
|
| 120 |
+
"PComputeCutting": 0.008776664733886719,
|
| 121 |
+
"PGLayoutTilingPipeline": 1.8517823219299316,
|
| 122 |
+
"PGTiling": 0.26313185691833496,
|
| 123 |
+
"PadElimination": 0.0006458759307861328,
|
| 124 |
+
"ParAxesAnnotation": 0.188338041305542,
|
| 125 |
+
"PartialLoopFusion": 0.05682229995727539,
|
| 126 |
+
"PartialSimdFusion": 0.0237729549407959,
|
| 127 |
+
"PenguinizeFunctions": 5.5999997130129486e-05,
|
| 128 |
+
"PerfectLoopNest": 0.00557398796081543,
|
| 129 |
+
"PruneFunctions": 3.9999998989515007e-05,
|
| 130 |
+
"RecognizeOpIdiom": 0.008669376373291016,
|
| 131 |
+
"Recompute": 0.0005908012390136719,
|
| 132 |
+
"RelaxPredicates": 0.006473541259765625,
|
| 133 |
+
"Rematerialization": 0.011237144470214844,
|
| 134 |
+
"RemoveOptimizationBarriers": 7.400000322377309e-05,
|
| 135 |
+
"RemoveShardedPartitionAxes": 0.014671802520751953,
|
| 136 |
+
"ReshapeWeights": 0.0018546581268310547,
|
| 137 |
+
"ResolveAccessConflict": 0.008959770202636719,
|
| 138 |
+
"ResolveComplicatePredicates": 0.0009264945983886719,
|
| 139 |
+
"RewriteReplicationMatmul": 0.0037200450897216797,
|
| 140 |
+
"RewriteWeights": 0.008005380630493164,
|
| 141 |
+
"SFKVectorizer": 0.2967853546142578,
|
| 142 |
+
"ScatterMotion": 1.900000097521115e-05,
|
| 143 |
+
"ShardingPropagationAnalysis": 0.10689902305603027,
|
| 144 |
+
"SimpleAllReduceTiling": 0.010908842086791992,
|
| 145 |
+
"Simplifier": 0.00808858871459961,
|
| 146 |
+
"SimplifyMacroPredicates": 0.031823158264160156,
|
| 147 |
+
"SimplifyNeuronTensor": 0.12780547142028809,
|
| 148 |
+
"SimplifySlice": 0.001531362533569336,
|
| 149 |
+
"SimplifyTensor": 0.018309593200683594,
|
| 150 |
+
"SpillPSum": 0.09417366981506348,
|
| 151 |
+
"SplitAPUnionSets": 0.09693408012390137,
|
| 152 |
+
"SplitAccGrp": 0.0025701522827148438,
|
| 153 |
+
"StaticProfiler": 0.04053521156311035,
|
| 154 |
+
"StaticTransposeLocalTensor": 0.012635231018066406,
|
| 155 |
+
"SundaISel": 0.10333561897277832,
|
| 156 |
+
"TCTransform": 0.006776332855224609,
|
| 157 |
+
"TensorInitialization": 0.011014938354492188,
|
| 158 |
+
"TensorOpSimplifier": 0.005452632904052734,
|
| 159 |
+
"TensorOpTransform": 0.033481597900390625,
|
| 160 |
+
"TensorizerLegalizationPass": 6.399999983841553e-05,
|
| 161 |
+
"TileCCOps": 0.011636972427368164,
|
| 162 |
+
"TilingProfiler": 0.024947643280029297,
|
| 163 |
+
"TransformConvOp": 0.013001441955566406,
|
| 164 |
+
"TritiumFusion": 0.1458723545074463,
|
| 165 |
+
"ValueNumbering": 0.003311634063720703,
|
| 166 |
+
"VectorizeDMA": 0.005986928939819336,
|
| 167 |
+
"VectorizeMatMult": 0.028806686401367188,
|
| 168 |
+
"VerifySupportedOps": 5.100000271340832e-05,
|
| 169 |
+
"WeightCoalescing": 0.01451730728149414,
|
| 170 |
+
"ZeroSizeTensorElimination": 0.00017833709716796875,
|
| 171 |
+
"algsimp": 0.0020910000894218683,
|
| 172 |
+
"batchnorm_expander": 5.0000002374872565e-05,
|
| 173 |
+
"boundary-marker-removal": 1.900000097521115e-05,
|
| 174 |
+
"call-inliner": 0.00046300000394694507,
|
| 175 |
+
"canonicalize-boundary-marker": 2.300000051036477e-05,
|
| 176 |
+
"collective-stream-id-checker": 8.800000068731606e-05,
|
| 177 |
+
"comparison-expander": 0.0005719999899156392,
|
| 178 |
+
"computation-deduplicator": 8.399999933317304e-05,
|
| 179 |
+
"config-lowering": 0.00016599999798927456,
|
| 180 |
+
"constant-statistics": 0.0004529999860096723,
|
| 181 |
+
"constant_folding": 0.00018699999782256782,
|
| 182 |
+
"cse": 6.299999949987978e-05,
|
| 183 |
+
"dce": 4.400000034365803e-05,
|
| 184 |
+
"dot_decomposer": 0.001028000027872622,
|
| 185 |
+
"dynamic-slice-transpose": 2.0000001313746907e-05,
|
| 186 |
+
"eliminate-redundant-compare": 0.00014699999883305281,
|
| 187 |
+
"emit-offloaded-dropout": 5.499999679159373e-05,
|
| 188 |
+
"flatten-call-graph": 0.0006470000371336937,
|
| 189 |
+
"fuse-send-recv": 9.600000339560211e-05,
|
| 190 |
+
"hilo-conditional-to-select": 2.9000000722589903e-05,
|
| 191 |
+
"hilo::LegalizeAlias": 1.500000053056283e-05,
|
| 192 |
+
"hilo::NeuronInstCombine": 0.00012700000661425292,
|
| 193 |
+
"hilo::NeuronOpFusion": 4.099999932805076e-05,
|
| 194 |
+
"hilo::ReplaceTokenTypeWithU8Pass": 7.400000322377309e-05,
|
| 195 |
+
"hilo::ScheduleFusion": 3.000000106112566e-06,
|
| 196 |
+
"hilo::SixtyFourHack": 9.599999611964449e-05,
|
| 197 |
+
"hilo::VerifyAliasing": 6.000000212225132e-06,
|
| 198 |
+
"hlo-mac-count": 0.015143999829888344,
|
| 199 |
+
"instruction-histogram": 0.0010160000529140234,
|
| 200 |
+
"io-con-pipe-begin": 4.999999873689376e-06,
|
| 201 |
+
"io-con-pipe-end": 0.0,
|
| 202 |
+
"io-layout-normalization": 0.0007440000190399587,
|
| 203 |
+
"io-statistics": 3.9999998989515007e-05,
|
| 204 |
+
"legalize-ccops-for-tensorizer": 6.000000212225132e-06,
|
| 205 |
+
"legalize-compare": 1.3999999282532372e-05,
|
| 206 |
+
"lower-argminmax-custom-call": 1.5999999959603883e-05,
|
| 207 |
+
"map-inline": 0.0008340000058524311,
|
| 208 |
+
"metadata-naming": 7.79999973019585e-05,
|
| 209 |
+
"mlir::detail::OpToOpPassAdaptor": 5.8000001445179805e-05,
|
| 210 |
+
"mlir::hlo::MhloToPyPenguin": 0.013376999646425247,
|
| 211 |
+
"mlir::mhlo::LowerComplexExtraPass": 0.00022300001000985503,
|
| 212 |
+
"mlir::mhlo::LowerComplexPass": 0.0004149999876972288,
|
| 213 |
+
"native-to-custom-softmax": 0.0003029999788850546,
|
| 214 |
+
"native-to-custom-softmax-dx": 0.0021089999936521053,
|
| 215 |
+
"neuron-hlo-verifier": 0.011952999979257584,
|
| 216 |
+
"operand_upcaster": 5.5999997130129486e-05,
|
| 217 |
+
"opt-barrier-removal": 0.00026000000070780516,
|
| 218 |
+
"post-par-pipe-begin": 0.0003480000013951212,
|
| 219 |
+
"post-par-pipe-end": 0.0,
|
| 220 |
+
"post-partition-simplification": 0.002303000073879957,
|
| 221 |
+
"pre-par-pipe-begin": 9.999999974752427e-07,
|
| 222 |
+
"pre-par-pipe-end": 0.0,
|
| 223 |
+
"pre-partition-simplification": 0.07090699672698975,
|
| 224 |
+
"replace-minimum-constant": 0.0003819999983534217,
|
| 225 |
+
"reshape-mover": 6.299999949987978e-05,
|
| 226 |
+
"simplify-concat": 0.00014800000644754618,
|
| 227 |
+
"simplify-while-loops": 9.100000170292333e-05,
|
| 228 |
+
"transform-variadic-reduce": 9.299999510403723e-05,
|
| 229 |
+
"tuple-simplifier": 0.0001649999903747812,
|
| 230 |
+
"unpack-nested-aws-ntwsr": 0.00024099998699966818,
|
| 231 |
+
"unroll-while-loop": 3.5000000934815034e-05,
|
| 232 |
+
"zero_sized_hlo_elimination": 0.00072900002123788
|
| 233 |
+
},
|
| 234 |
+
"hilo": {
|
| 235 |
+
"ConstantSize": 3678847.0,
|
| 236 |
+
"HloInputCount": 371.0,
|
| 237 |
+
"HloMacCount": 111825780736.0,
|
| 238 |
+
"HloOutputCount": 57.0,
|
| 239 |
+
"IfmapSize": 3910928384.0,
|
| 240 |
+
"OfmapSize": 1879048192.0,
|
| 241 |
+
"OutputsReadFromCount": 0.0,
|
| 242 |
+
"PassthroughTensorsCount": 0.0,
|
| 243 |
+
"RedundantOutputCount": 0.0,
|
| 244 |
+
"Traffic": 973052032.0
|
| 245 |
+
},
|
| 246 |
+
"tensorizer": {
|
| 247 |
+
"DMATilingProfiler::TotalInstructionsAfterTiling": 25519.0,
|
| 248 |
+
"StaticProfiler::AifUb": 337.1839904785156,
|
| 249 |
+
"StaticProfiler::ArithmeticIntensityTensorizer": 248.63792419433594,
|
| 250 |
+
"StaticProfiler::AverageDmaLength": 2413.602294921875,
|
| 251 |
+
"StaticProfiler::DDRTransferBytes": 495991840.0,
|
| 252 |
+
"StaticProfiler::InternalTransferBytes": 361682720.0,
|
| 253 |
+
"StaticProfiler::LoadExpanded": 133728.0,
|
| 254 |
+
"StaticProfiler::StoreExpanded": 7530.0,
|
| 255 |
+
"StaticProfiler::TotalDMAExpanded": 141258.0,
|
| 256 |
+
"StaticProfiler::TotalDynamicInstancesCount": 30781.0,
|
| 257 |
+
"StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 30330.0,
|
| 258 |
+
"StaticProfiler::TotalLNCComm": 0.0,
|
| 259 |
+
"StaticProfiler::TotalLNCCommTransfer": 0.0,
|
| 260 |
+
"TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
|
| 261 |
+
"TilingProfiler::DmaInstructionsAfterTiling": 0.0,
|
| 262 |
+
"TilingProfiler::GenericInstructionsAfterTiling": 4.0,
|
| 263 |
+
"TilingProfiler::MatMultInstructionsAfterTiling": 14112.0,
|
| 264 |
+
"TilingProfiler::NumPfTransposes": 5.0,
|
| 265 |
+
"TilingProfiler::NumPfTransposesForIo": 1.0,
|
| 266 |
+
"TilingProfiler::NumPfTransposesForLocal": 1.0,
|
| 267 |
+
"TilingProfiler::NumPfTransposesForNonlocal": 3.0,
|
| 268 |
+
"TilingProfiler::PfTransposeInstructions": 10273.0,
|
| 269 |
+
"TilingProfiler::PfTransposeInstructionsForIo": 9504.0,
|
| 270 |
+
"TilingProfiler::PfTransposeInstructionsForLocal": 1.0,
|
| 271 |
+
"TilingProfiler::PfTransposeInstructionsForNonlocal": 768.0,
|
| 272 |
+
"TilingProfiler::ReduceInstructionsAfterTiling": 10.0,
|
| 273 |
+
"TilingProfiler::SimdInstructionsAfterTiling": 311.0,
|
| 274 |
+
"TilingProfiler::TotalInstructionsAfterTiling": 0.0,
|
| 275 |
+
"TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
|
| 276 |
+
"TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
|
| 277 |
+
"TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
|
| 278 |
+
"TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
|
| 279 |
+
"TransformConvOp::conv2d_column_packing": 0.0,
|
| 280 |
+
"TransformConvOp::conv2d_column_packing_1": 0.0,
|
| 281 |
+
"TransformConvOp::conv2d_column_packing_io10": 0.0,
|
| 282 |
+
"TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
|
| 283 |
+
}
|
| 284 |
+
},
|
| 285 |
+
"all": {
|
| 286 |
+
"compiletime": {
|
| 287 |
+
"algsimp": 0.0017770000267773867,
|
| 288 |
+
"call-inliner": 0.00041700000292621553,
|
| 289 |
+
"collective-stream-id-checker": 5.500000042957254e-05,
|
| 290 |
+
"comparison-expander": 0.0005280000041238964,
|
| 291 |
+
"constant-statistics": 0.0004529999860096723,
|
| 292 |
+
"constant_folding": 0.0001429999974789098,
|
| 293 |
+
"dce": 3.899999865097925e-05,
|
| 294 |
+
"dot_decomposer": 0.001028000027872622,
|
| 295 |
+
"eliminate-redundant-compare": 0.0001320000010309741,
|
| 296 |
+
"flatten-call-graph": 0.0006070000235922635,
|
| 297 |
+
"hlo-mac-count": 0.007338999770581722,
|
| 298 |
+
"instruction-histogram": 0.0010160000529140234,
|
| 299 |
+
"io-con-pipe-begin": 4.999999873689376e-06,
|
| 300 |
+
"io-con-pipe-end": 0.0,
|
| 301 |
+
"io-layout-normalization": 0.0007440000190399587,
|
| 302 |
+
"io-statistics": 3.9999998989515007e-05,
|
| 303 |
+
"map-inline": 0.0007900000200606883,
|
| 304 |
+
"native-to-custom-softmax": 0.00028199999360367656,
|
| 305 |
+
"native-to-custom-softmax-dx": 0.00042699999175965786,
|
| 306 |
+
"neuron-hlo-verifier": 0.010262000374495983,
|
| 307 |
+
"opt-barrier-removal": 0.00026000000070780516,
|
| 308 |
+
"pre-par-pipe-begin": 9.999999974752427e-07,
|
| 309 |
+
"pre-par-pipe-end": 0.0,
|
| 310 |
+
"pre-partition-simplification": 0.07090699672698975,
|
| 311 |
+
"replace-minimum-constant": 0.0003480000013951212,
|
| 312 |
+
"reshape-mover": 4.8999998398358e-05,
|
| 313 |
+
"simplify-while-loops": 8.099999831756577e-05,
|
| 314 |
+
"tuple-simplifier": 0.0001429999974789098,
|
| 315 |
+
"unpack-nested-aws-ntwsr": 0.00022499999613501132,
|
| 316 |
+
"unroll-while-loop": 1.2000000424450263e-05,
|
| 317 |
+
"zero_sized_hlo_elimination": 0.00072900002123788
|
| 318 |
+
}
|
| 319 |
+
},
|
| 320 |
+
"attention_isa_kernel": {
|
| 321 |
+
"compiletime": {
|
| 322 |
+
"CoalesceCCOp": 0.00029277801513671875,
|
| 323 |
+
"DMALocalityOpt": 0.00019669532775878906,
|
| 324 |
+
"DMAProfiler": 0.0002949237823486328,
|
| 325 |
+
"DataStreaming": 0.0002338886260986328,
|
| 326 |
+
"DoNothing": 0.0014209747314453125,
|
| 327 |
+
"ExpandISAMacro": 0.00028014183044433594,
|
| 328 |
+
"FactorizeBlkDims": 0.0051081180572509766,
|
| 329 |
+
"InferPSumTensor": 0.0036172866821289063,
|
| 330 |
+
"InferSharedMemLoc": 0.0005719661712646484,
|
| 331 |
+
"InsertCoreBarrier": 0.0023279190063476563,
|
| 332 |
+
"LateLegalizeInst": 0.0016858577728271484,
|
| 333 |
+
"LateNeuronInstComb": 0.00044226646423339844,
|
| 334 |
+
"LegalizeSundaAccess": 0.0002193450927734375,
|
| 335 |
+
"LegalizeType": 0.002800464630126953,
|
| 336 |
+
"LowerBroadcast": 0.0002620220184326172,
|
| 337 |
+
"LowerIntrinsics": 0.0003139972686767578,
|
| 338 |
+
"LowerTranspose": 0.0002512931823730469,
|
| 339 |
+
"NeuronInstComb": 0.0005278587341308594,
|
| 340 |
+
"NeuronLICM": 0.0002562999725341797,
|
| 341 |
+
"NeuronSimplifyPredicates": 0.0002334117889404297,
|
| 342 |
+
"NeuronValueNumbering": 0.0002815723419189453,
|
| 343 |
+
"SFKVectorizer": 0.005394458770751953,
|
| 344 |
+
"SimpleAllReduceTiling": 0.0003223419189453125,
|
| 345 |
+
"SimplifyNeuronTensor": 0.0007545948028564453,
|
| 346 |
+
"SpillPSum": 0.0006477832794189453,
|
| 347 |
+
"WeightCoalescing": 0.00023102760314941406
|
| 348 |
+
}
|
| 349 |
+
},
|
| 350 |
+
"cumsum": {
|
| 351 |
+
"compiletime": {
|
| 352 |
+
"CoalesceCCOp": 0.00034165382385253906,
|
| 353 |
+
"DMALocalityOpt": 0.0003287792205810547,
|
| 354 |
+
"DMAProfiler": 0.001161336898803711,
|
| 355 |
+
"DataStreaming": 0.0004813671112060547,
|
| 356 |
+
"DoNothing": 0.00018596649169921875,
|
| 357 |
+
"ExpandISAMacro": 0.0008256435394287109,
|
| 358 |
+
"FactorizeBlkDims": 0.0007493495941162109,
|
| 359 |
+
"InferPSumTensor": 0.0011432170867919922,
|
| 360 |
+
"InferSharedMemLoc": 0.00045013427734375,
|
| 361 |
+
"InsertCoreBarrier": 0.00044918060302734375,
|
| 362 |
+
"LateLegalizeInst": 0.0019235610961914063,
|
| 363 |
+
"LateNeuronInstComb": 0.0011394023895263672,
|
| 364 |
+
"LegalizeSundaAccess": 0.002297639846801758,
|
| 365 |
+
"LegalizeType": 0.00036334991455078125,
|
| 366 |
+
"LowerBroadcast": 0.0003592967987060547,
|
| 367 |
+
"LowerIntrinsics": 0.000362396240234375,
|
| 368 |
+
"LowerTranspose": 0.0003514289855957031,
|
| 369 |
+
"NeuronInstComb": 0.0034132003784179688,
|
| 370 |
+
"NeuronLICM": 0.0006377696990966797,
|
| 371 |
+
"NeuronSimplifyPredicates": 0.0035140514373779297,
|
| 372 |
+
"NeuronValueNumbering": 0.001703023910522461,
|
| 373 |
+
"SFKVectorizer": 0.009377241134643555,
|
| 374 |
+
"SimpleAllReduceTiling": 0.0003190040588378906,
|
| 375 |
+
"SimplifyNeuronTensor": 0.0036399364471435547,
|
| 376 |
+
"SpillPSum": 0.0008790493011474609,
|
| 377 |
+
"WeightCoalescing": 0.0003619194030761719
|
| 378 |
+
}
|
| 379 |
+
},
|
| 380 |
+
"sg00": {
|
| 381 |
+
"compiletime": {
|
| 382 |
+
"CanonicalizeConv": 1.9999999949504854e-06,
|
| 383 |
+
"CanonicalizeForTensorizer": 1.8999999156221747e-05,
|
| 384 |
+
"Canonicalizer": 0.0004579999949783087,
|
| 385 |
+
"HoistCompute": 3.000000106112566e-06,
|
| 386 |
+
"IdentifyCrossPassTensors": 2.5999999706982635e-05,
|
| 387 |
+
"MemcastMotion": 9.999999747378752e-06,
|
| 388 |
+
"PenguinizeFunctions": 1.9999999494757503e-05,
|
| 389 |
+
"PruneFunctions": 1.2000000424450263e-05,
|
| 390 |
+
"RemoveOptimizationBarriers": 2.499999936844688e-05,
|
| 391 |
+
"ScatterMotion": 9.999999747378752e-06,
|
| 392 |
+
"TensorizerLegalizationPass": 3.199999991920777e-05,
|
| 393 |
+
"VerifySupportedOps": 1.700000029813964e-05,
|
| 394 |
+
"algsimp": 8.900000102585182e-05,
|
| 395 |
+
"batchnorm_expander": 1.700000029813964e-05,
|
| 396 |
+
"boundary-marker-removal": 6.000000212225132e-06,
|
| 397 |
+
"call-inliner": 1.2999999853491317e-05,
|
| 398 |
+
"canonicalize-boundary-marker": 7.999999979801942e-06,
|
| 399 |
+
"collective-stream-id-checker": 2.499999936844688e-05,
|
| 400 |
+
"comparison-expander": 6.000000212225132e-06,
|
| 401 |
+
"computation-deduplicator": 2.499999936844688e-05,
|
| 402 |
+
"config-lowering": 5.0999999075429514e-05,
|
| 403 |
+
"constant_folding": 1.2999999853491317e-05,
|
| 404 |
+
"cse": 1.9999999494757503e-05,
|
| 405 |
+
"dce": 1.9999999949504854e-06,
|
| 406 |
+
"dynamic-slice-transpose": 7.000000096013537e-06,
|
| 407 |
+
"eliminate-redundant-compare": 4.999999873689376e-06,
|
| 408 |
+
"emit-offloaded-dropout": 1.8999999156221747e-05,
|
| 409 |
+
"flatten-call-graph": 1.2000000424450263e-05,
|
| 410 |
+
"fuse-send-recv": 2.8000000384054147e-05,
|
| 411 |
+
"hilo-conditional-to-select": 7.999999979801942e-06,
|
| 412 |
+
"hilo::LegalizeAlias": 6.000000212225132e-06,
|
| 413 |
+
"hilo::NeuronInstCombine": 5.900000178371556e-05,
|
| 414 |
+
"hilo::NeuronOpFusion": 1.1000000085914508e-05,
|
| 415 |
+
"hilo::ReplaceTokenTypeWithU8Pass": 3.5000000934815034e-05,
|
| 416 |
+
"hilo::ScheduleFusion": 9.999999974752427e-07,
|
| 417 |
+
"hilo::SixtyFourHack": 1.8000000636675395e-05,
|
| 418 |
+
"hilo::VerifyAliasing": 3.000000106112566e-06,
|
| 419 |
+
"hlo-mac-count": 0.00014400000509340316,
|
| 420 |
+
"legalize-ccops-for-tensorizer": 3.000000106112566e-06,
|
| 421 |
+
"legalize-compare": 4.999999873689376e-06,
|
| 422 |
+
"lower-argminmax-custom-call": 4.999999873689376e-06,
|
| 423 |
+
"map-inline": 1.4000000192027073e-05,
|
| 424 |
+
"metadata-naming": 2.4000000848900527e-05,
|
| 425 |
+
"mlir::detail::OpToOpPassAdaptor": 1.9999999494757503e-05,
|
| 426 |
+
"mlir::hlo::MhloToPyPenguin": 0.0029299999587237835,
|
| 427 |
+
"mlir::mhlo::LowerComplexExtraPass": 7.200000254670158e-05,
|
| 428 |
+
"mlir::mhlo::LowerComplexPass": 0.00014200000441633165,
|
| 429 |
+
"native-to-custom-softmax": 7.999999979801942e-06,
|
| 430 |
+
"native-to-custom-softmax-dx": 0.0016329999780282378,
|
| 431 |
+
"neuron-hlo-verifier": 0.000598000013269484,
|
| 432 |
+
"operand_upcaster": 1.9999999494757503e-05,
|
| 433 |
+
"post-par-pipe-begin": 0.00034500000765547156,
|
| 434 |
+
"post-par-pipe-end": 0.0,
|
| 435 |
+
"post-partition-simplification": 0.0007699999841861427,
|
| 436 |
+
"replace-minimum-constant": 9.999999747378752e-06,
|
| 437 |
+
"reshape-mover": 4.999999873689376e-06,
|
| 438 |
+
"simplify-concat": 4.70000013592653e-05,
|
| 439 |
+
"simplify-while-loops": 3.000000106112566e-06,
|
| 440 |
+
"transform-variadic-reduce": 1.1000000085914508e-05,
|
| 441 |
+
"tuple-simplifier": 7.000000096013537e-06,
|
| 442 |
+
"unpack-nested-aws-ntwsr": 4.999999873689376e-06,
|
| 443 |
+
"unroll-while-loop": 9.999999974752427e-07
|
| 444 |
+
},
|
| 445 |
+
"hilo": {
|
| 446 |
+
"ArithmeticIntensity": 79.95455932617188,
|
| 447 |
+
"ConstantSize": 3678847.0,
|
| 448 |
+
"HloInputCount": 371.0,
|
| 449 |
+
"HloMacCount": 17179869184.0,
|
| 450 |
+
"HloOutputCount": 57.0,
|
| 451 |
+
"IfmapSize": 3910928384.0,
|
| 452 |
+
"OfmapSize": 1879048192.0,
|
| 453 |
+
"OutputsReadFromCount": 0.0,
|
| 454 |
+
"PassthroughTensorsCount": 0.0,
|
| 455 |
+
"RedundantOutputCount": 0.0,
|
| 456 |
+
"Traffic": 429740832.0
|
| 457 |
+
}
|
| 458 |
+
},
|
| 459 |
+
"sg0000": {
|
| 460 |
+
"compiletime": {
|
| 461 |
+
"AGOrderingAnalysisPass": 0.06203174591064453,
|
| 462 |
+
"AffinePredicateResolution": 0.001997709274291992,
|
| 463 |
+
"AliasDependencyElimination": 0.00024080276489257813,
|
| 464 |
+
"AliasDependencyInduction": 0.0331728458404541,
|
| 465 |
+
"AliasDependencyReset": 0.10205578804016113,
|
| 466 |
+
"BFComputeCutting": 0.007540702819824219,
|
| 467 |
+
"BirCodeGenLoop": 0.15983891487121582,
|
| 468 |
+
"CCOpFusion": 0.06544995307922363,
|
| 469 |
+
"CanonicalizeDAGForPGTiling": 0.004024982452392578,
|
| 470 |
+
"CanonicalizeIR": 0.001623392105102539,
|
| 471 |
+
"CoalesceCCOp": 0.011837482452392578,
|
| 472 |
+
"CommuteConcat": 0.009541988372802734,
|
| 473 |
+
"DMALocalityOpt": 0.0019822120666503906,
|
| 474 |
+
"DMAProfiler": 0.007272958755493164,
|
| 475 |
+
"DMATilingProfiler": 0.007293224334716797,
|
| 476 |
+
"DataLocalityOpt": 0.2593100070953369,
|
| 477 |
+
"DataStreaming": 0.0239105224609375,
|
| 478 |
+
"DeConcat": 0.005833864212036133,
|
| 479 |
+
"DeadCodeElimination": 0.00394749641418457,
|
| 480 |
+
"DeadStoreElimination": 0.07077240943908691,
|
| 481 |
+
"DelinearIndices": 0.02637171745300293,
|
| 482 |
+
"Delinearization": 0.01995396614074707,
|
| 483 |
+
"DelinearizeSPMD": 0.03704118728637695,
|
| 484 |
+
"DoNothing": 9.799003601074219e-05,
|
| 485 |
+
"DramToDramTranspose": 0.03482198715209961,
|
| 486 |
+
"DumpGraphAndMetadata": 0.01542520523071289,
|
| 487 |
+
"EliminateDivs": 0.005273103713989258,
|
| 488 |
+
"ExpandBatchNorm": 0.0026073455810546875,
|
| 489 |
+
"ExpandISAMacro": 0.008665800094604492,
|
| 490 |
+
"FactorizeBlkDims": 0.061437368392944336,
|
| 491 |
+
"FactorizeThreadAxesInFreeDims": 0.002484560012817383,
|
| 492 |
+
"FlattenMacroLoop": 0.008157968521118164,
|
| 493 |
+
"GenericAccessSimplifier": 0.0014643669128417969,
|
| 494 |
+
"InferInitValue": 0.08534860610961914,
|
| 495 |
+
"InferIntrinsicOnCC": 0.01716780662536621,
|
| 496 |
+
"InferNeuronTensor": 0.09510421752929688,
|
| 497 |
+
"InferNonlocalTensors": 0.16463732719421387,
|
| 498 |
+
"InferPSumTensor": 0.09516620635986328,
|
| 499 |
+
"InferShardAxis": 0.5436458587646484,
|
| 500 |
+
"InferSharedMemLoc": 0.013478994369506836,
|
| 501 |
+
"InlineNativeKernels": 0.0027844905853271484,
|
| 502 |
+
"InsertCoreBarrier": 0.008362293243408203,
|
| 503 |
+
"InsertIOTransposes": 0.07836699485778809,
|
| 504 |
+
"InsertImplicitShardAxisBeforeISel": 0.008057355880737305,
|
| 505 |
+
"InsertLocalTransposes": 0.01099085807800293,
|
| 506 |
+
"InsertOffloadedTransposes": 0.03647184371948242,
|
| 507 |
+
"LICM": 0.005979299545288086,
|
| 508 |
+
"LateLegalizeInst": 0.012919187545776367,
|
| 509 |
+
"LateLegalizePostSplit": 0.007997751235961914,
|
| 510 |
+
"LateLowerReshapeOp": 0.011852502822875977,
|
| 511 |
+
"LateLowerTensorOp": 0.007149696350097656,
|
| 512 |
+
"LateNeuronInstComb": 0.053853750228881836,
|
| 513 |
+
"LayoutPreprocessing": 0.07254910469055176,
|
| 514 |
+
"LayoutPreprocessingAndAnalysis": 0.13735532760620117,
|
| 515 |
+
"LayoutRequirementAnalysis": 0.012064695358276367,
|
| 516 |
+
"LegalizeCCOpLayout": 0.003309011459350586,
|
| 517 |
+
"LegalizeOpLevelAlias": 0.004944324493408203,
|
| 518 |
+
"LegalizePartitionReduce": 0.002275705337524414,
|
| 519 |
+
"LegalizeSundaAccess": 0.13529706001281738,
|
| 520 |
+
"LegalizeSundaMacro": 0.017252445220947266,
|
| 521 |
+
"LegalizeType": 0.007556915283203125,
|
| 522 |
+
"LocalLayoutOpt": 0.04438447952270508,
|
| 523 |
+
"LoopFusion": 0.018953561782836914,
|
| 524 |
+
"LoopSplitting": 0.0016851425170898438,
|
| 525 |
+
"LowerBroadcast": 0.005589485168457031,
|
| 526 |
+
"LowerCCOpBlockAxis": 0.009353399276733398,
|
| 527 |
+
"LowerComplexBroadcast": 0.011426210403442383,
|
| 528 |
+
"LowerIntrinsics": 0.04210019111633301,
|
| 529 |
+
"LowerShardAxis": 0.014751195907592773,
|
| 530 |
+
"LowerTensorOp": 0.02877187728881836,
|
| 531 |
+
"LowerToSendRecv": 0.006161689758300781,
|
| 532 |
+
"LowerTranspose": 0.02186894416809082,
|
| 533 |
+
"MacroGeneration": 0.1734302043914795,
|
| 534 |
+
"MaskPropagation": 0.014665842056274414,
|
| 535 |
+
"MemcpyElimination": 0.3008904457092285,
|
| 536 |
+
"MutateDataType": 0.0027010440826416016,
|
| 537 |
+
"NeuronAliasDependencyInduction": 0.0006909370422363281,
|
| 538 |
+
"NeuronAliasDependencyReset": 0.022809267044067383,
|
| 539 |
+
"NeuronInstComb": 0.005879402160644531,
|
| 540 |
+
"NeuronLICM": 0.0464015007019043,
|
| 541 |
+
"NeuronLoopFusion": 0.05638718605041504,
|
| 542 |
+
"NeuronLoopInterchange": 0.00871729850769043,
|
| 543 |
+
"NeuronSimplifier": 0.02101302146911621,
|
| 544 |
+
"NeuronSimplifyPredicates": 0.004530191421508789,
|
| 545 |
+
"NeuronValueNumbering": 0.007061004638671875,
|
| 546 |
+
"OptimizeAliasedCopyChain": 0.001558065414428711,
|
| 547 |
+
"OptimizeNKIKernels": 0.3715829849243164,
|
| 548 |
+
"PAGLayoutOpt": 0.648719310760498,
|
| 549 |
+
"PComputeCutting": 0.02423238754272461,
|
| 550 |
+
"PGLayoutTilingPipeline": 2.515984058380127,
|
| 551 |
+
"PGTiling": 0.46158504486083984,
|
| 552 |
+
"PadElimination": 0.0023555755615234375,
|
| 553 |
+
"ParAxesAnnotation": 0.5548486709594727,
|
| 554 |
+
"PartialLoopFusion": 0.04628252983093262,
|
| 555 |
+
"PartialSimdFusion": 0.06029558181762695,
|
| 556 |
+
"PerfectLoopNest": 0.0032892227172851563,
|
| 557 |
+
"RecognizeOpIdiom": 0.01747274398803711,
|
| 558 |
+
"Recompute": 0.00046896934509277344,
|
| 559 |
+
"RelaxPredicates": 0.00874948501586914,
|
| 560 |
+
"Rematerialization": 0.023741722106933594,
|
| 561 |
+
"RemoveShardedPartitionAxes": 0.041913747787475586,
|
| 562 |
+
"ReshapeWeights": 0.0023987293243408203,
|
| 563 |
+
"ResolveAccessConflict": 0.013326883316040039,
|
| 564 |
+
"ResolveComplicatePredicates": 0.0010704994201660156,
|
| 565 |
+
"RewriteReplicationMatmul": 0.00213623046875,
|
| 566 |
+
"RewriteWeights": 0.006081342697143555,
|
| 567 |
+
"SFKVectorizer": 0.5432095527648926,
|
| 568 |
+
"ShardingPropagationAnalysis": 0.04027843475341797,
|
| 569 |
+
"SimpleAllReduceTiling": 0.005087375640869141,
|
| 570 |
+
"Simplifier": 0.008136272430419922,
|
| 571 |
+
"SimplifyMacroPredicates": 0.010492086410522461,
|
| 572 |
+
"SimplifyNeuronTensor": 0.033696889877319336,
|
| 573 |
+
"SimplifySlice": 0.0016849040985107422,
|
| 574 |
+
"SimplifyTensor": 0.013016223907470703,
|
| 575 |
+
"SpillPSum": 0.04322075843811035,
|
| 576 |
+
"SplitAPUnionSets": 0.04480147361755371,
|
| 577 |
+
"SplitAccGrp": 0.0033092498779296875,
|
| 578 |
+
"StaticProfiler": 0.02093505859375,
|
| 579 |
+
"StaticTransposeLocalTensor": 0.011444330215454102,
|
| 580 |
+
"SundaISel": 0.0645599365234375,
|
| 581 |
+
"TCTransform": 0.0017342567443847656,
|
| 582 |
+
"TensorInitialization": 0.014005661010742188,
|
| 583 |
+
"TensorOpSimplifier": 0.010408163070678711,
|
| 584 |
+
"TensorOpTransform": 0.062005043029785156,
|
| 585 |
+
"TileCCOps": 0.007296085357666016,
|
| 586 |
+
"TilingProfiler": 0.04326295852661133,
|
| 587 |
+
"TransformConvOp": 0.004875659942626953,
|
| 588 |
+
"TritiumFusion": 0.12003302574157715,
|
| 589 |
+
"ValueNumbering": 0.007851839065551758,
|
| 590 |
+
"VectorizeDMA": 0.008031368255615234,
|
| 591 |
+
"VectorizeMatMult": 0.030368566513061523,
|
| 592 |
+
"WeightCoalescing": 0.009224891662597656,
|
| 593 |
+
"ZeroSizeTensorElimination": 0.0001709461212158203
|
| 594 |
+
},
|
| 595 |
+
"tensorizer": {
|
| 596 |
+
"DMATilingProfiler::TotalInstructionsAfterTiling": 3453.0,
|
| 597 |
+
"StaticProfiler::AifUb": 66.1578598022461,
|
| 598 |
+
"StaticProfiler::ArithmeticIntensityTensorizer": 256.2751770019531,
|
| 599 |
+
"StaticProfiler::AverageDmaLength": 1973.780029296875,
|
| 600 |
+
"StaticProfiler::AverageFractalPeUtilization": 99.81855773925781,
|
| 601 |
+
"StaticProfiler::AveragePartitionUtilization": 99.43334197998047,
|
| 602 |
+
"StaticProfiler::AveragePeUtilization": 99.31205749511719,
|
| 603 |
+
"StaticProfiler::DDRTransferBytes": 122882568.0,
|
| 604 |
+
"StaticProfiler::InternalTransferBytes": 87572480.0,
|
| 605 |
+
"StaticProfiler::LoadExpanded": 18965.0,
|
| 606 |
+
"StaticProfiler::LocalizationEfficiency": 387.36920166015625,
|
| 607 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 656.1036376953125,
|
| 608 |
+
"StaticProfiler::StoreExpanded": 17921.0,
|
| 609 |
+
"StaticProfiler::TotalDMAExpanded": 36886.0,
|
| 610 |
+
"StaticProfiler::TotalDynamicInstancesCount": 4675.0,
|
| 611 |
+
"StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 4662.0,
|
| 612 |
+
"StaticProfiler::TotalLNCComm": 0.0,
|
| 613 |
+
"StaticProfiler::TotalLNCCommTransfer": 0.0,
|
| 614 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
|
| 615 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
|
| 616 |
+
"TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
|
| 617 |
+
"TilingProfiler::DmaInstructionsAfterTiling": 0.0,
|
| 618 |
+
"TilingProfiler::GenericInstructionsAfterTiling": 192.0,
|
| 619 |
+
"TilingProfiler::MatMultInstructionsAfterTiling": 1552.0,
|
| 620 |
+
"TilingProfiler::NumPfTransposes": 7.0,
|
| 621 |
+
"TilingProfiler::NumPfTransposesForIo": 1.0,
|
| 622 |
+
"TilingProfiler::NumPfTransposesForLocal": 3.0,
|
| 623 |
+
"TilingProfiler::NumPfTransposesForNonlocal": 3.0,
|
| 624 |
+
"TilingProfiler::PfTransposeInstructions": 896.0,
|
| 625 |
+
"TilingProfiler::PfTransposeInstructionsForIo": 256.0,
|
| 626 |
+
"TilingProfiler::PfTransposeInstructionsForLocal": 256.0,
|
| 627 |
+
"TilingProfiler::PfTransposeInstructionsForNonlocal": 384.0,
|
| 628 |
+
"TilingProfiler::ReduceInstructionsAfterTiling": 0.0,
|
| 629 |
+
"TilingProfiler::SimdInstructionsAfterTiling": 468.0,
|
| 630 |
+
"TilingProfiler::TotalInstructionsAfterTiling": 0.0,
|
| 631 |
+
"TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
|
| 632 |
+
"TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
|
| 633 |
+
"TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
|
| 634 |
+
"TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
|
| 635 |
+
"TransformConvOp::conv2d_column_packing": 0.0,
|
| 636 |
+
"TransformConvOp::conv2d_column_packing_1": 0.0,
|
| 637 |
+
"TransformConvOp::conv2d_column_packing_io10": 0.0,
|
| 638 |
+
"TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
|
| 639 |
+
}
|
| 640 |
+
},
|
| 641 |
+
"sg0001": {
|
| 642 |
+
"compiletime": {
|
| 643 |
+
"AGOrderingAnalysisPass": 0.10222506523132324,
|
| 644 |
+
"AffinePredicateResolution": 0.002437591552734375,
|
| 645 |
+
"AliasDependencyElimination": 0.00020074844360351563,
|
| 646 |
+
"AliasDependencyInduction": 0.030005455017089844,
|
| 647 |
+
"AliasDependencyReset": 0.08542060852050781,
|
| 648 |
+
"BFComputeCutting": 0.009021759033203125,
|
| 649 |
+
"BirCodeGenLoop": 0.0576014518737793,
|
| 650 |
+
"CCOpFusion": 0.07059645652770996,
|
| 651 |
+
"CanonicalizeDAGForPGTiling": 0.011131525039672852,
|
| 652 |
+
"CanonicalizeIR": 0.0030748844146728516,
|
| 653 |
+
"CoalesceCCOp": 0.016925573348999023,
|
| 654 |
+
"CommuteConcat": 0.004233837127685547,
|
| 655 |
+
"DMALocalityOpt": 0.0022597312927246094,
|
| 656 |
+
"DMAProfiler": 0.011726617813110352,
|
| 657 |
+
"DMATilingProfiler": 0.010080099105834961,
|
| 658 |
+
"DataLocalityOpt": 0.45432257652282715,
|
| 659 |
+
"DataStreaming": 0.007066249847412109,
|
| 660 |
+
"DeConcat": 0.010270833969116211,
|
| 661 |
+
"DeadCodeElimination": 0.003401517868041992,
|
| 662 |
+
"DeadStoreElimination": 0.08969426155090332,
|
| 663 |
+
"DelinearIndices": 0.020795345306396484,
|
| 664 |
+
"Delinearization": 0.006405353546142578,
|
| 665 |
+
"DelinearizeSPMD": 0.031574249267578125,
|
| 666 |
+
"DoNothing": 0.00010728836059570313,
|
| 667 |
+
"DramToDramTranspose": 0.021518468856811523,
|
| 668 |
+
"DumpGraphAndMetadata": 0.00677490234375,
|
| 669 |
+
"EliminateDivs": 0.0029458999633789063,
|
| 670 |
+
"ExpandBatchNorm": 0.003565549850463867,
|
| 671 |
+
"ExpandISAMacro": 0.006104230880737305,
|
| 672 |
+
"FactorizeBlkDims": 0.03833317756652832,
|
| 673 |
+
"FactorizeThreadAxesInFreeDims": 0.007614850997924805,
|
| 674 |
+
"FlattenMacroLoop": 0.01127004623413086,
|
| 675 |
+
"GenericAccessSimplifier": 0.0043070316314697266,
|
| 676 |
+
"InferInitValue": 0.06825661659240723,
|
| 677 |
+
"InferIntrinsicOnCC": 0.046250104904174805,
|
| 678 |
+
"InferNeuronTensor": 0.09652161598205566,
|
| 679 |
+
"InferNonlocalTensors": 0.08535599708557129,
|
| 680 |
+
"InferPSumTensor": 0.08618307113647461,
|
| 681 |
+
"InferShardAxis": 0.6054186820983887,
|
| 682 |
+
"InferSharedMemLoc": 0.007490873336791992,
|
| 683 |
+
"InlineNativeKernels": 0.0046694278717041016,
|
| 684 |
+
"InsertCoreBarrier": 0.00831913948059082,
|
| 685 |
+
"InsertIOTransposes": 0.07386589050292969,
|
| 686 |
+
"InsertImplicitShardAxisBeforeISel": 0.012522697448730469,
|
| 687 |
+
"InsertLocalTransposes": 0.018398761749267578,
|
| 688 |
+
"InsertOffloadedTransposes": 0.03478860855102539,
|
| 689 |
+
"LICM": 0.006189107894897461,
|
| 690 |
+
"LateLegalizeInst": 0.018419742584228516,
|
| 691 |
+
"LateLegalizePostSplit": 0.011380195617675781,
|
| 692 |
+
"LateLowerReshapeOp": 0.006206035614013672,
|
| 693 |
+
"LateLowerTensorOp": 0.006627559661865234,
|
| 694 |
+
"LateNeuronInstComb": 0.013695240020751953,
|
| 695 |
+
"LayoutPreprocessing": 0.08205723762512207,
|
| 696 |
+
"LayoutPreprocessingAndAnalysis": 0.3778700828552246,
|
| 697 |
+
"LayoutRequirementAnalysis": 0.027397871017456055,
|
| 698 |
+
"LegalizeCCOpLayout": 0.004743337631225586,
|
| 699 |
+
"LegalizeOpLevelAlias": 0.001989126205444336,
|
| 700 |
+
"LegalizePartitionReduce": 0.003030061721801758,
|
| 701 |
+
"LegalizeSundaAccess": 0.026180505752563477,
|
| 702 |
+
"LegalizeSundaMacro": 0.02354145050048828,
|
| 703 |
+
"LegalizeType": 0.012012004852294922,
|
| 704 |
+
"LocalLayoutOpt": 0.09747910499572754,
|
| 705 |
+
"LoopFusion": 0.011905670166015625,
|
| 706 |
+
"LoopSplitting": 0.005662441253662109,
|
| 707 |
+
"LowerBroadcast": 0.0031082630157470703,
|
| 708 |
+
"LowerCCOpBlockAxis": 0.015021800994873047,
|
| 709 |
+
"LowerComplexBroadcast": 0.004594564437866211,
|
| 710 |
+
"LowerIntrinsics": 0.061724185943603516,
|
| 711 |
+
"LowerShardAxis": 0.01390695571899414,
|
| 712 |
+
"LowerTensorOp": 0.032297372817993164,
|
| 713 |
+
"LowerToSendRecv": 0.005787849426269531,
|
| 714 |
+
"LowerTranspose": 0.014832496643066406,
|
| 715 |
+
"MacroGeneration": 0.17066407203674316,
|
| 716 |
+
"MaskPropagation": 0.004767894744873047,
|
| 717 |
+
"MemcpyElimination": 0.3223605155944824,
|
| 718 |
+
"MutateDataType": 0.0023605823516845703,
|
| 719 |
+
"NeuronAliasDependencyInduction": 0.0017361640930175781,
|
| 720 |
+
"NeuronAliasDependencyReset": 0.02784562110900879,
|
| 721 |
+
"NeuronInstComb": 0.008632659912109375,
|
| 722 |
+
"NeuronLICM": 0.01805901527404785,
|
| 723 |
+
"NeuronLoopFusion": 0.041216135025024414,
|
| 724 |
+
"NeuronLoopInterchange": 0.0041141510009765625,
|
| 725 |
+
"NeuronSimplifier": 0.025291919708251953,
|
| 726 |
+
"NeuronSimplifyPredicates": 0.007104635238647461,
|
| 727 |
+
"NeuronValueNumbering": 0.0058324337005615234,
|
| 728 |
+
"OptimizeAliasedCopyChain": 0.0016317367553710938,
|
| 729 |
+
"OptimizeNKIKernels": 0.4839596748352051,
|
| 730 |
+
"PAGLayoutOpt": 0.3772914409637451,
|
| 731 |
+
"PComputeCutting": 0.03927016258239746,
|
| 732 |
+
"PGLayoutTilingPipeline": 2.7096974849700928,
|
| 733 |
+
"PGTiling": 0.5330896377563477,
|
| 734 |
+
"PadElimination": 0.0010271072387695313,
|
| 735 |
+
"ParAxesAnnotation": 0.32303333282470703,
|
| 736 |
+
"PartialLoopFusion": 0.05098128318786621,
|
| 737 |
+
"PartialSimdFusion": 0.10409116744995117,
|
| 738 |
+
"PerfectLoopNest": 0.008025884628295898,
|
| 739 |
+
"RecognizeOpIdiom": 0.014155864715576172,
|
| 740 |
+
"Recompute": 0.0006039142608642578,
|
| 741 |
+
"RelaxPredicates": 0.007999897003173828,
|
| 742 |
+
"Rematerialization": 0.0150146484375,
|
| 743 |
+
"RemoveShardedPartitionAxes": 0.04702639579772949,
|
| 744 |
+
"ReshapeWeights": 0.0015103816986083984,
|
| 745 |
+
"ResolveAccessConflict": 0.0074825286865234375,
|
| 746 |
+
"ResolveComplicatePredicates": 0.002012014389038086,
|
| 747 |
+
"RewriteReplicationMatmul": 0.002730846405029297,
|
| 748 |
+
"RewriteWeights": 0.01182103157043457,
|
| 749 |
+
"SFKVectorizer": 0.4407639503479004,
|
| 750 |
+
"ShardingPropagationAnalysis": 0.029230833053588867,
|
| 751 |
+
"SimpleAllReduceTiling": 0.005069255828857422,
|
| 752 |
+
"Simplifier": 0.020698070526123047,
|
| 753 |
+
"SimplifyMacroPredicates": 0.021116018295288086,
|
| 754 |
+
"SimplifyNeuronTensor": 0.012060403823852539,
|
| 755 |
+
"SimplifySlice": 0.0015597343444824219,
|
| 756 |
+
"SimplifyTensor": 0.014514684677124023,
|
| 757 |
+
"SpillPSum": 0.048569679260253906,
|
| 758 |
+
"SplitAPUnionSets": 0.05286097526550293,
|
| 759 |
+
"SplitAccGrp": 0.002934694290161133,
|
| 760 |
+
"StaticProfiler": 0.013947248458862305,
|
| 761 |
+
"StaticTransposeLocalTensor": 0.00755763053894043,
|
| 762 |
+
"SundaISel": 0.06808805465698242,
|
| 763 |
+
"TCTransform": 0.0025751590728759766,
|
| 764 |
+
"TensorInitialization": 0.005185127258300781,
|
| 765 |
+
"TensorOpSimplifier": 0.024057626724243164,
|
| 766 |
+
"TensorOpTransform": 0.06213688850402832,
|
| 767 |
+
"TileCCOps": 0.025543689727783203,
|
| 768 |
+
"TilingProfiler": 0.02153778076171875,
|
| 769 |
+
"TransformConvOp": 0.007241010665893555,
|
| 770 |
+
"TritiumFusion": 0.1687297821044922,
|
| 771 |
+
"ValueNumbering": 0.009909868240356445,
|
| 772 |
+
"VectorizeDMA": 0.008072137832641602,
|
| 773 |
+
"VectorizeMatMult": 0.042955636978149414,
|
| 774 |
+
"WeightCoalescing": 0.003875732421875,
|
| 775 |
+
"ZeroSizeTensorElimination": 0.00020575523376464844
|
| 776 |
+
},
|
| 777 |
+
"tensorizer": {
|
| 778 |
+
"DMATilingProfiler::TotalInstructionsAfterTiling": 8283.0,
|
| 779 |
+
"StaticProfiler::AifUb": 502.6534729003906,
|
| 780 |
+
"StaticProfiler::ArithmeticIntensityTensorizer": 413.67962646484375,
|
| 781 |
+
"StaticProfiler::AverageDmaLength": 2481.933349609375,
|
| 782 |
+
"StaticProfiler::AverageFractalPeUtilization": 100.0,
|
| 783 |
+
"StaticProfiler::AveragePartitionUtilization": 99.62867736816406,
|
| 784 |
+
"StaticProfiler::AveragePeUtilization": 100.0,
|
| 785 |
+
"StaticProfiler::DDRTransferBytes": 266536960.0,
|
| 786 |
+
"StaticProfiler::InternalTransferBytes": 79167488.0,
|
| 787 |
+
"StaticProfiler::LoadExpanded": 71809.0,
|
| 788 |
+
"StaticProfiler::LocalizationEfficiency": 82.29916381835938,
|
| 789 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 103.46524047851563,
|
| 790 |
+
"StaticProfiler::StoreExpanded": 18433.0,
|
| 791 |
+
"StaticProfiler::TotalDMAExpanded": 90242.0,
|
| 792 |
+
"StaticProfiler::TotalDynamicInstancesCount": 9699.0,
|
| 793 |
+
"StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 9699.0,
|
| 794 |
+
"StaticProfiler::TotalLNCComm": 0.0,
|
| 795 |
+
"StaticProfiler::TotalLNCCommTransfer": 0.0,
|
| 796 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
|
| 797 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
|
| 798 |
+
"TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
|
| 799 |
+
"TilingProfiler::DmaInstructionsAfterTiling": 0.0,
|
| 800 |
+
"TilingProfiler::GenericInstructionsAfterTiling": 128.0,
|
| 801 |
+
"TilingProfiler::MatMultInstructionsAfterTiling": 6144.0,
|
| 802 |
+
"TilingProfiler::NumPfTransposes": 8.0,
|
| 803 |
+
"TilingProfiler::NumPfTransposesForIo": 3.0,
|
| 804 |
+
"TilingProfiler::NumPfTransposesForLocal": 3.0,
|
| 805 |
+
"TilingProfiler::NumPfTransposesForNonlocal": 2.0,
|
| 806 |
+
"TilingProfiler::PfTransposeInstructions": 992.0,
|
| 807 |
+
"TilingProfiler::PfTransposeInstructionsForIo": 288.0,
|
| 808 |
+
"TilingProfiler::PfTransposeInstructionsForLocal": 192.0,
|
| 809 |
+
"TilingProfiler::PfTransposeInstructionsForNonlocal": 512.0,
|
| 810 |
+
"TilingProfiler::ReduceInstructionsAfterTiling": 0.0,
|
| 811 |
+
"TilingProfiler::SimdInstructionsAfterTiling": 547.0,
|
| 812 |
+
"TilingProfiler::TotalInstructionsAfterTiling": 0.0,
|
| 813 |
+
"TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
|
| 814 |
+
"TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
|
| 815 |
+
"TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
|
| 816 |
+
"TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
|
| 817 |
+
"TransformConvOp::conv2d_column_packing": 0.0,
|
| 818 |
+
"TransformConvOp::conv2d_column_packing_1": 0.0,
|
| 819 |
+
"TransformConvOp::conv2d_column_packing_io10": 0.0,
|
| 820 |
+
"TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
|
| 821 |
+
}
|
| 822 |
+
},
|
| 823 |
+
"sg0002": {
|
| 824 |
+
"compiletime": {
|
| 825 |
+
"AGOrderingAnalysisPass": 0.04760026931762695,
|
| 826 |
+
"AffinePredicateResolution": 0.003319978713989258,
|
| 827 |
+
"AliasDependencyElimination": 0.0002167224884033203,
|
| 828 |
+
"AliasDependencyInduction": 0.008548259735107422,
|
| 829 |
+
"AliasDependencyReset": 0.03149843215942383,
|
| 830 |
+
"BFComputeCutting": 0.00810694694519043,
|
| 831 |
+
"BirCodeGenLoop": 0.2911098003387451,
|
| 832 |
+
"CCOpFusion": 0.08548593521118164,
|
| 833 |
+
"CanonicalizeDAGForPGTiling": 0.007600545883178711,
|
| 834 |
+
"CanonicalizeIR": 0.0030400753021240234,
|
| 835 |
+
"CoalesceCCOp": 0.008062601089477539,
|
| 836 |
+
"CommuteConcat": 0.007961034774780273,
|
| 837 |
+
"DMALocalityOpt": 0.002327442169189453,
|
| 838 |
+
"DMAProfiler": 0.009556293487548828,
|
| 839 |
+
"DMATilingProfiler": 0.009016752243041992,
|
| 840 |
+
"DataLocalityOpt": 0.17029356956481934,
|
| 841 |
+
"DataStreaming": 0.007345914840698242,
|
| 842 |
+
"DeConcat": 0.01120138168334961,
|
| 843 |
+
"DeadCodeElimination": 0.010882377624511719,
|
| 844 |
+
"DeadStoreElimination": 0.010195016860961914,
|
| 845 |
+
"DelinearIndices": 0.010077953338623047,
|
| 846 |
+
"Delinearization": 0.011870861053466797,
|
| 847 |
+
"DelinearizeSPMD": 0.035944223403930664,
|
| 848 |
+
"DoNothing": 0.0001087188720703125,
|
| 849 |
+
"DramToDramTranspose": 0.013046979904174805,
|
| 850 |
+
"DumpGraphAndMetadata": 0.03416705131530762,
|
| 851 |
+
"EliminateDivs": 0.004259586334228516,
|
| 852 |
+
"ExpandBatchNorm": 0.0017371177673339844,
|
| 853 |
+
"ExpandISAMacro": 0.0058269500732421875,
|
| 854 |
+
"FactorizeBlkDims": 0.03687334060668945,
|
| 855 |
+
"FactorizeThreadAxesInFreeDims": 0.00911855697631836,
|
| 856 |
+
"FlattenMacroLoop": 0.0048520565032958984,
|
| 857 |
+
"GenericAccessSimplifier": 0.001367330551147461,
|
| 858 |
+
"InferInitValue": 0.0836641788482666,
|
| 859 |
+
"InferIntrinsicOnCC": 0.008740901947021484,
|
| 860 |
+
"InferNeuronTensor": 0.05709338188171387,
|
| 861 |
+
"InferNonlocalTensors": 0.041548728942871094,
|
| 862 |
+
"InferPSumTensor": 0.05230545997619629,
|
| 863 |
+
"InferShardAxis": 0.5781030654907227,
|
| 864 |
+
"InferSharedMemLoc": 0.026081323623657227,
|
| 865 |
+
"InlineNativeKernels": 0.002477407455444336,
|
| 866 |
+
"InsertCoreBarrier": 0.008142948150634766,
|
| 867 |
+
"InsertIOTransposes": 0.039937734603881836,
|
| 868 |
+
"InsertImplicitShardAxisBeforeISel": 0.013466596603393555,
|
| 869 |
+
"InsertLocalTransposes": 0.018125534057617188,
|
| 870 |
+
"InsertOffloadedTransposes": 0.014874696731567383,
|
| 871 |
+
"LICM": 0.0058231353759765625,
|
| 872 |
+
"LateLegalizeInst": 0.01174783706665039,
|
| 873 |
+
"LateLegalizePostSplit": 0.02429652214050293,
|
| 874 |
+
"LateLowerReshapeOp": 0.0018832683563232422,
|
| 875 |
+
"LateLowerTensorOp": 0.0021920204162597656,
|
| 876 |
+
"LateNeuronInstComb": 0.043119192123413086,
|
| 877 |
+
"LayoutPreprocessing": 0.06973385810852051,
|
| 878 |
+
"LayoutPreprocessingAndAnalysis": 0.11140203475952148,
|
| 879 |
+
"LayoutRequirementAnalysis": 0.013022661209106445,
|
| 880 |
+
"LegalizeCCOpLayout": 0.0020427703857421875,
|
| 881 |
+
"LegalizeOpLevelAlias": 0.0016918182373046875,
|
| 882 |
+
"LegalizePartitionReduce": 0.0030241012573242188,
|
| 883 |
+
"LegalizeSundaAccess": 0.045601606369018555,
|
| 884 |
+
"LegalizeSundaMacro": 0.02708148956298828,
|
| 885 |
+
"LegalizeType": 0.014174222946166992,
|
| 886 |
+
"LocalLayoutOpt": 0.022045135498046875,
|
| 887 |
+
"LoopFusion": 0.029404163360595703,
|
| 888 |
+
"LoopSplitting": 0.0007355213165283203,
|
| 889 |
+
"LowerBroadcast": 0.005047321319580078,
|
| 890 |
+
"LowerCCOpBlockAxis": 0.007714748382568359,
|
| 891 |
+
"LowerComplexBroadcast": 0.005654096603393555,
|
| 892 |
+
"LowerIntrinsics": 0.04253792762756348,
|
| 893 |
+
"LowerShardAxis": 0.03305673599243164,
|
| 894 |
+
"LowerTensorOp": 0.028458356857299805,
|
| 895 |
+
"LowerToSendRecv": 0.03391242027282715,
|
| 896 |
+
"LowerTranspose": 0.04655814170837402,
|
| 897 |
+
"MacroGeneration": 0.06428074836730957,
|
| 898 |
+
"MaskPropagation": 0.0036263465881347656,
|
| 899 |
+
"MemcpyElimination": 0.05451250076293945,
|
| 900 |
+
"MutateDataType": 0.001516103744506836,
|
| 901 |
+
"NeuronAliasDependencyInduction": 0.0005834102630615234,
|
| 902 |
+
"NeuronAliasDependencyReset": 0.022034168243408203,
|
| 903 |
+
"NeuronInstComb": 0.04628133773803711,
|
| 904 |
+
"NeuronLICM": 0.026567935943603516,
|
| 905 |
+
"NeuronLoopFusion": 0.07339620590209961,
|
| 906 |
+
"NeuronLoopInterchange": 0.0027348995208740234,
|
| 907 |
+
"NeuronSimplifier": 0.021918296813964844,
|
| 908 |
+
"NeuronSimplifyPredicates": 0.014072179794311523,
|
| 909 |
+
"NeuronValueNumbering": 0.013863325119018555,
|
| 910 |
+
"OptimizeAliasedCopyChain": 0.0008976459503173828,
|
| 911 |
+
"OptimizeNKIKernels": 4.611967086791992,
|
| 912 |
+
"PAGLayoutOpt": 0.2917053699493408,
|
| 913 |
+
"PComputeCutting": 0.008776664733886719,
|
| 914 |
+
"PGLayoutTilingPipeline": 1.8517823219299316,
|
| 915 |
+
"PGTiling": 0.26313185691833496,
|
| 916 |
+
"PadElimination": 0.0006458759307861328,
|
| 917 |
+
"ParAxesAnnotation": 0.188338041305542,
|
| 918 |
+
"PartialLoopFusion": 0.05682229995727539,
|
| 919 |
+
"PartialSimdFusion": 0.0237729549407959,
|
| 920 |
+
"PerfectLoopNest": 0.00557398796081543,
|
| 921 |
+
"RecognizeOpIdiom": 0.008669376373291016,
|
| 922 |
+
"Recompute": 0.0005908012390136719,
|
| 923 |
+
"RelaxPredicates": 0.006473541259765625,
|
| 924 |
+
"Rematerialization": 0.011237144470214844,
|
| 925 |
+
"RemoveShardedPartitionAxes": 0.014671802520751953,
|
| 926 |
+
"ReshapeWeights": 0.0018546581268310547,
|
| 927 |
+
"ResolveAccessConflict": 0.008959770202636719,
|
| 928 |
+
"ResolveComplicatePredicates": 0.0009264945983886719,
|
| 929 |
+
"RewriteReplicationMatmul": 0.0037200450897216797,
|
| 930 |
+
"RewriteWeights": 0.008005380630493164,
|
| 931 |
+
"SFKVectorizer": 0.1923050880432129,
|
| 932 |
+
"ShardingPropagationAnalysis": 0.10689902305603027,
|
| 933 |
+
"SimpleAllReduceTiling": 0.003542184829711914,
|
| 934 |
+
"Simplifier": 0.00808858871459961,
|
| 935 |
+
"SimplifyMacroPredicates": 0.031823158264160156,
|
| 936 |
+
"SimplifyNeuronTensor": 0.013367414474487305,
|
| 937 |
+
"SimplifySlice": 0.001531362533569336,
|
| 938 |
+
"SimplifyTensor": 0.018309593200683594,
|
| 939 |
+
"SpillPSum": 0.03448653221130371,
|
| 940 |
+
"SplitAPUnionSets": 0.09693408012390137,
|
| 941 |
+
"SplitAccGrp": 0.0025701522827148438,
|
| 942 |
+
"StaticProfiler": 0.04053521156311035,
|
| 943 |
+
"StaticTransposeLocalTensor": 0.012635231018066406,
|
| 944 |
+
"SundaISel": 0.10333561897277832,
|
| 945 |
+
"TCTransform": 0.006776332855224609,
|
| 946 |
+
"TensorInitialization": 0.011014938354492188,
|
| 947 |
+
"TensorOpSimplifier": 0.005452632904052734,
|
| 948 |
+
"TensorOpTransform": 0.033481597900390625,
|
| 949 |
+
"TileCCOps": 0.011636972427368164,
|
| 950 |
+
"TilingProfiler": 0.024947643280029297,
|
| 951 |
+
"TransformConvOp": 0.013001441955566406,
|
| 952 |
+
"TritiumFusion": 0.1458723545074463,
|
| 953 |
+
"ValueNumbering": 0.003311634063720703,
|
| 954 |
+
"VectorizeDMA": 0.005986928939819336,
|
| 955 |
+
"VectorizeMatMult": 0.028806686401367188,
|
| 956 |
+
"WeightCoalescing": 0.007086515426635742,
|
| 957 |
+
"ZeroSizeTensorElimination": 0.00017833709716796875
|
| 958 |
+
},
|
| 959 |
+
"tensorizer": {
|
| 960 |
+
"DMATilingProfiler::TotalInstructionsAfterTiling": 25519.0,
|
| 961 |
+
"StaticProfiler::AifUb": 337.1839904785156,
|
| 962 |
+
"StaticProfiler::ArithmeticIntensityTensorizer": 248.63792419433594,
|
| 963 |
+
"StaticProfiler::AverageDmaLength": 2413.602294921875,
|
| 964 |
+
"StaticProfiler::AverageFractalPeUtilization": 98.93502807617188,
|
| 965 |
+
"StaticProfiler::AveragePartitionUtilization": 95.0970230102539,
|
| 966 |
+
"StaticProfiler::AveragePeUtilization": 97.18069458007813,
|
| 967 |
+
"StaticProfiler::DDRTransferBytes": 495991840.0,
|
| 968 |
+
"StaticProfiler::InternalTransferBytes": 361682720.0,
|
| 969 |
+
"StaticProfiler::LoadExpanded": 133728.0,
|
| 970 |
+
"StaticProfiler::LocalizationEfficiency": 73.73954010009766,
|
| 971 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 79.92718505859375,
|
| 972 |
+
"StaticProfiler::StoreExpanded": 7530.0,
|
| 973 |
+
"StaticProfiler::TotalDMAExpanded": 141258.0,
|
| 974 |
+
"StaticProfiler::TotalDynamicInstancesCount": 30781.0,
|
| 975 |
+
"StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 30330.0,
|
| 976 |
+
"StaticProfiler::TotalLNCComm": 0.0,
|
| 977 |
+
"StaticProfiler::TotalLNCCommTransfer": 0.0,
|
| 978 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
|
| 979 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
|
| 980 |
+
"TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
|
| 981 |
+
"TilingProfiler::DmaInstructionsAfterTiling": 0.0,
|
| 982 |
+
"TilingProfiler::GenericInstructionsAfterTiling": 4.0,
|
| 983 |
+
"TilingProfiler::MatMultInstructionsAfterTiling": 14112.0,
|
| 984 |
+
"TilingProfiler::NumPfTransposes": 5.0,
|
| 985 |
+
"TilingProfiler::NumPfTransposesForIo": 1.0,
|
| 986 |
+
"TilingProfiler::NumPfTransposesForLocal": 1.0,
|
| 987 |
+
"TilingProfiler::NumPfTransposesForNonlocal": 3.0,
|
| 988 |
+
"TilingProfiler::PfTransposeInstructions": 10273.0,
|
| 989 |
+
"TilingProfiler::PfTransposeInstructionsForIo": 9504.0,
|
| 990 |
+
"TilingProfiler::PfTransposeInstructionsForLocal": 1.0,
|
| 991 |
+
"TilingProfiler::PfTransposeInstructionsForNonlocal": 768.0,
|
| 992 |
+
"TilingProfiler::ReduceInstructionsAfterTiling": 10.0,
|
| 993 |
+
"TilingProfiler::SimdInstructionsAfterTiling": 311.0,
|
| 994 |
+
"TilingProfiler::TotalInstructionsAfterTiling": 0.0,
|
| 995 |
+
"TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
|
| 996 |
+
"TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
|
| 997 |
+
"TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
|
| 998 |
+
"TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
|
| 999 |
+
"TransformConvOp::conv2d_column_packing": 0.0,
|
| 1000 |
+
"TransformConvOp::conv2d_column_packing_1": 0.0,
|
| 1001 |
+
"TransformConvOp::conv2d_column_packing_io10": 0.0,
|
| 1002 |
+
"TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
|
| 1003 |
+
}
|
| 1004 |
+
},
|
| 1005 |
+
"sg01": {
|
| 1006 |
+
"compiletime": {
|
| 1007 |
+
"CanonicalizeConv": 1.8000000636675395e-05,
|
| 1008 |
+
"CanonicalizeForTensorizer": 1.8000000636675395e-05,
|
| 1009 |
+
"Canonicalizer": 0.0003330000035930425,
|
| 1010 |
+
"HoistCompute": 3.000000106112566e-06,
|
| 1011 |
+
"IdentifyCrossPassTensors": 1.5999999959603883e-05,
|
| 1012 |
+
"MemcastMotion": 7.000000096013537e-06,
|
| 1013 |
+
"PenguinizeFunctions": 1.8000000636675395e-05,
|
| 1014 |
+
"PruneFunctions": 1.8000000636675395e-05,
|
| 1015 |
+
"RemoveOptimizationBarriers": 2.4000000848900527e-05,
|
| 1016 |
+
"ScatterMotion": 7.000000096013537e-06,
|
| 1017 |
+
"TensorizerLegalizationPass": 2.300000051036477e-05,
|
| 1018 |
+
"VerifySupportedOps": 1.5999999959603883e-05,
|
| 1019 |
+
"algsimp": 9.899999713525176e-05,
|
| 1020 |
+
"batchnorm_expander": 1.5999999959603883e-05,
|
| 1021 |
+
"boundary-marker-removal": 7.000000096013537e-06,
|
| 1022 |
+
"call-inliner": 1.4000000192027073e-05,
|
| 1023 |
+
"canonicalize-boundary-marker": 7.999999979801942e-06,
|
| 1024 |
+
"collective-stream-id-checker": 3.999999989900971e-06,
|
| 1025 |
+
"comparison-expander": 7.999999979801942e-06,
|
| 1026 |
+
"computation-deduplicator": 2.700000004551839e-05,
|
| 1027 |
+
"config-lowering": 4.999999873689376e-05,
|
| 1028 |
+
"constant_folding": 1.4000000192027073e-05,
|
| 1029 |
+
"cse": 1.8000000636675395e-05,
|
| 1030 |
+
"dce": 9.999999974752427e-07,
|
| 1031 |
+
"dynamic-slice-transpose": 6.000000212225132e-06,
|
| 1032 |
+
"eliminate-redundant-compare": 4.999999873689376e-06,
|
| 1033 |
+
"emit-offloaded-dropout": 1.5999999959603883e-05,
|
| 1034 |
+
"flatten-call-graph": 1.1000000085914508e-05,
|
| 1035 |
+
"fuse-send-recv": 2.9000000722589903e-05,
|
| 1036 |
+
"hilo-conditional-to-select": 9.000000318337698e-06,
|
| 1037 |
+
"hilo::LegalizeAlias": 6.000000212225132e-06,
|
| 1038 |
+
"hilo::NeuronInstCombine": 5.400000009103678e-05,
|
| 1039 |
+
"hilo::NeuronOpFusion": 1.2000000424450263e-05,
|
| 1040 |
+
"hilo::ReplaceTokenTypeWithU8Pass": 1.8000000636675395e-05,
|
| 1041 |
+
"hilo::ScheduleFusion": 9.999999974752427e-07,
|
| 1042 |
+
"hilo::SixtyFourHack": 1.5999999959603883e-05,
|
| 1043 |
+
"hilo::VerifyAliasing": 1.9999999949504854e-06,
|
| 1044 |
+
"hlo-mac-count": 0.00012700000661425292,
|
| 1045 |
+
"legalize-ccops-for-tensorizer": 9.999999974752427e-07,
|
| 1046 |
+
"legalize-compare": 4.999999873689376e-06,
|
| 1047 |
+
"lower-argminmax-custom-call": 4.999999873689376e-06,
|
| 1048 |
+
"map-inline": 1.4000000192027073e-05,
|
| 1049 |
+
"metadata-naming": 2.9000000722589903e-05,
|
| 1050 |
+
"mlir::detail::OpToOpPassAdaptor": 2.499999936844688e-05,
|
| 1051 |
+
"mlir::hlo::MhloToPyPenguin": 0.0017209999496117234,
|
| 1052 |
+
"mlir::mhlo::LowerComplexExtraPass": 7.200000254670158e-05,
|
| 1053 |
+
"mlir::mhlo::LowerComplexPass": 0.00014099999680183828,
|
| 1054 |
+
"native-to-custom-softmax": 7.000000096013537e-06,
|
| 1055 |
+
"native-to-custom-softmax-dx": 2.300000051036477e-05,
|
| 1056 |
+
"neuron-hlo-verifier": 0.0005729999975301325,
|
| 1057 |
+
"operand_upcaster": 1.8999999156221747e-05,
|
| 1058 |
+
"post-par-pipe-begin": 9.999999974752427e-07,
|
| 1059 |
+
"post-par-pipe-end": 0.0,
|
| 1060 |
+
"post-partition-simplification": 0.0007699999841861427,
|
| 1061 |
+
"replace-minimum-constant": 9.000000318337698e-06,
|
| 1062 |
+
"reshape-mover": 3.999999989900971e-06,
|
| 1063 |
+
"simplify-concat": 4.8999998398358e-05,
|
| 1064 |
+
"simplify-while-loops": 3.000000106112566e-06,
|
| 1065 |
+
"transform-variadic-reduce": 1.1000000085914508e-05,
|
| 1066 |
+
"tuple-simplifier": 7.000000096013537e-06,
|
| 1067 |
+
"unpack-nested-aws-ntwsr": 4.999999873689376e-06,
|
| 1068 |
+
"unroll-while-loop": 2.099999983329326e-05
|
| 1069 |
+
},
|
| 1070 |
+
"hilo": {
|
| 1071 |
+
"ArithmeticIntensity": 661.1749267578125,
|
| 1072 |
+
"HloMacCount": 55834574848.0,
|
| 1073 |
+
"Traffic": 168895008.0
|
| 1074 |
+
}
|
| 1075 |
+
},
|
| 1076 |
+
"sg02": {
|
| 1077 |
+
"compiletime": {
|
| 1078 |
+
"CanonicalizeConv": 7.000000096013537e-06,
|
| 1079 |
+
"CanonicalizeForTensorizer": 1.9999999494757503e-05,
|
| 1080 |
+
"Canonicalizer": 0.0004039999912492931,
|
| 1081 |
+
"HoistCompute": 0.0,
|
| 1082 |
+
"IdentifyCrossPassTensors": 1.9999999494757503e-05,
|
| 1083 |
+
"MemcastMotion": 0.0,
|
| 1084 |
+
"PenguinizeFunctions": 1.8000000636675395e-05,
|
| 1085 |
+
"PruneFunctions": 9.999999747378752e-06,
|
| 1086 |
+
"RemoveOptimizationBarriers": 2.499999936844688e-05,
|
| 1087 |
+
"ScatterMotion": 1.9999999949504854e-06,
|
| 1088 |
+
"TensorizerLegalizationPass": 9.000000318337698e-06,
|
| 1089 |
+
"VerifySupportedOps": 1.8000000636675395e-05,
|
| 1090 |
+
"algsimp": 0.00012599999899975955,
|
| 1091 |
+
"batchnorm_expander": 1.700000029813964e-05,
|
| 1092 |
+
"boundary-marker-removal": 6.000000212225132e-06,
|
| 1093 |
+
"call-inliner": 1.8999999156221747e-05,
|
| 1094 |
+
"canonicalize-boundary-marker": 7.000000096013537e-06,
|
| 1095 |
+
"collective-stream-id-checker": 3.999999989900971e-06,
|
| 1096 |
+
"comparison-expander": 2.9999999242136255e-05,
|
| 1097 |
+
"computation-deduplicator": 3.199999991920777e-05,
|
| 1098 |
+
"config-lowering": 6.500000017695129e-05,
|
| 1099 |
+
"constant_folding": 1.700000029813964e-05,
|
| 1100 |
+
"cse": 2.499999936844688e-05,
|
| 1101 |
+
"dce": 1.9999999949504854e-06,
|
| 1102 |
+
"dynamic-slice-transpose": 7.000000096013537e-06,
|
| 1103 |
+
"eliminate-redundant-compare": 4.999999873689376e-06,
|
| 1104 |
+
"emit-offloaded-dropout": 1.9999999494757503e-05,
|
| 1105 |
+
"flatten-call-graph": 1.700000029813964e-05,
|
| 1106 |
+
"fuse-send-recv": 3.899999865097925e-05,
|
| 1107 |
+
"hilo-conditional-to-select": 1.2000000424450263e-05,
|
| 1108 |
+
"hilo::LegalizeAlias": 3.000000106112566e-06,
|
| 1109 |
+
"hilo::NeuronInstCombine": 1.4000000192027073e-05,
|
| 1110 |
+
"hilo::NeuronOpFusion": 1.8000000636675395e-05,
|
| 1111 |
+
"hilo::ReplaceTokenTypeWithU8Pass": 2.099999983329326e-05,
|
| 1112 |
+
"hilo::ScheduleFusion": 9.999999974752427e-07,
|
| 1113 |
+
"hilo::SixtyFourHack": 6.199999916134402e-05,
|
| 1114 |
+
"hilo::VerifyAliasing": 9.999999974752427e-07,
|
| 1115 |
+
"hlo-mac-count": 0.007534000091254711,
|
| 1116 |
+
"legalize-ccops-for-tensorizer": 1.9999999949504854e-06,
|
| 1117 |
+
"legalize-compare": 3.999999989900971e-06,
|
| 1118 |
+
"lower-argminmax-custom-call": 6.000000212225132e-06,
|
| 1119 |
+
"map-inline": 1.5999999959603883e-05,
|
| 1120 |
+
"metadata-naming": 2.499999936844688e-05,
|
| 1121 |
+
"mlir::detail::OpToOpPassAdaptor": 1.2999999853491317e-05,
|
| 1122 |
+
"mlir::hlo::MhloToPyPenguin": 0.008725999854505062,
|
| 1123 |
+
"mlir::mhlo::LowerComplexExtraPass": 7.899999764049426e-05,
|
| 1124 |
+
"mlir::mhlo::LowerComplexPass": 0.0001320000010309741,
|
| 1125 |
+
"native-to-custom-softmax": 6.000000212225132e-06,
|
| 1126 |
+
"native-to-custom-softmax-dx": 2.5999999706982635e-05,
|
| 1127 |
+
"neuron-hlo-verifier": 0.0005200000014156103,
|
| 1128 |
+
"operand_upcaster": 1.700000029813964e-05,
|
| 1129 |
+
"post-par-pipe-begin": 1.9999999949504854e-06,
|
| 1130 |
+
"post-par-pipe-end": 0.0,
|
| 1131 |
+
"post-partition-simplification": 0.00076299998909235,
|
| 1132 |
+
"replace-minimum-constant": 1.4999999621068127e-05,
|
| 1133 |
+
"reshape-mover": 4.999999873689376e-06,
|
| 1134 |
+
"simplify-concat": 5.199999941396527e-05,
|
| 1135 |
+
"simplify-while-loops": 3.999999989900971e-06,
|
| 1136 |
+
"transform-variadic-reduce": 7.100000220816582e-05,
|
| 1137 |
+
"tuple-simplifier": 7.999999979801942e-06,
|
| 1138 |
+
"unpack-nested-aws-ntwsr": 6.000000212225132e-06,
|
| 1139 |
+
"unroll-while-loop": 9.999999974752427e-07
|
| 1140 |
+
},
|
| 1141 |
+
"hilo": {
|
| 1142 |
+
"ArithmeticIntensity": 207.31654357910156,
|
| 1143 |
+
"HloMacCount": 38811336704.0,
|
| 1144 |
+
"Traffic": 374416192.0
|
| 1145 |
+
}
|
| 1146 |
+
},
|
| 1147 |
+
"topk": {
|
| 1148 |
+
"compiletime": {
|
| 1149 |
+
"CoalesceCCOp": 0.012049198150634766,
|
| 1150 |
+
"DMALocalityOpt": 0.013970613479614258,
|
| 1151 |
+
"DMAProfiler": 0.007668733596801758,
|
| 1152 |
+
"DataStreaming": 0.031991004943847656,
|
| 1153 |
+
"DoNothing": 0.0002658367156982422,
|
| 1154 |
+
"ExpandISAMacro": 0.007843732833862305,
|
| 1155 |
+
"FactorizeBlkDims": 0.03324699401855469,
|
| 1156 |
+
"InferPSumTensor": 0.17986130714416504,
|
| 1157 |
+
"InferSharedMemLoc": 0.0050508975982666016,
|
| 1158 |
+
"InsertCoreBarrier": 0.0073986053466796875,
|
| 1159 |
+
"LateLegalizeInst": 0.02333354949951172,
|
| 1160 |
+
"LateNeuronInstComb": 0.01965785026550293,
|
| 1161 |
+
"LegalizeSundaAccess": 0.0358271598815918,
|
| 1162 |
+
"LegalizeType": 0.026246309280395508,
|
| 1163 |
+
"LowerBroadcast": 0.023288965225219727,
|
| 1164 |
+
"LowerIntrinsics": 0.008131980895996094,
|
| 1165 |
+
"LowerTranspose": 0.004733085632324219,
|
| 1166 |
+
"NeuronInstComb": 0.01128530502319336,
|
| 1167 |
+
"NeuronLICM": 0.027612686157226563,
|
| 1168 |
+
"NeuronSimplifyPredicates": 0.006512641906738281,
|
| 1169 |
+
"NeuronValueNumbering": 0.007419109344482422,
|
| 1170 |
+
"SFKVectorizer": 0.09510302543640137,
|
| 1171 |
+
"SimpleAllReduceTiling": 0.0070476531982421875,
|
| 1172 |
+
"SimplifyNeuronTensor": 0.11079812049865723,
|
| 1173 |
+
"SpillPSum": 0.058808088302612305,
|
| 1174 |
+
"WeightCoalescing": 0.0070688724517822266
|
| 1175 |
+
}
|
| 1176 |
+
}
|
| 1177 |
+
}
|
context_encoding_model/_tp0_bk4/graph.neff
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:83bfd67384e1a0c5645609060b8bfb6fc5cfe3dbbd75b7568508606e623f387d
|
| 3 |
+
size 1926144
|
context_encoding_model/_tp0_bk4/log-neuron-cc.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
context_encoding_model/_tp0_bk4/metaneff.pb
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f2c9cfa0cd764e2b2f060557a0315ea75ce71a4875299aa863b7564b6f41b711
|
| 3 |
+
size 3644060
|
context_encoding_model/_tp0_bk4/model.MODULE_95ef7ca73cc0a6161be2+96be3c33.hlo_module.pb
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8c564e37d09483fd3fa5207db2f0d41a54a9993b618c3243e9e641c74a7d8a5c
|
| 3 |
+
size 3730846
|
context_encoding_model/_tp0_bk4/model.MODULE_95ef7ca73cc0a6161be2+96be3c33.neff
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:83bfd67384e1a0c5645609060b8bfb6fc5cfe3dbbd75b7568508606e623f387d
|
| 3 |
+
size 1926144
|
context_encoding_model/_tp0_bk4/neuron_config.json
ADDED
|
@@ -0,0 +1,224 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_attn_implementation_autoset": false,
|
| 3 |
+
"_name_or_path": "/home/ubuntu/models/Qwen3-1.7B",
|
| 4 |
+
"add_cross_attention": false,
|
| 5 |
+
"architectures": [
|
| 6 |
+
"Qwen3ForCausalLM"
|
| 7 |
+
],
|
| 8 |
+
"attention_bias": false,
|
| 9 |
+
"attention_dropout": 0.0,
|
| 10 |
+
"attribute_map": {},
|
| 11 |
+
"bad_words_ids": null,
|
| 12 |
+
"begin_suppress_tokens": null,
|
| 13 |
+
"bos_token_id": 151643,
|
| 14 |
+
"chunk_size_feed_forward": 0,
|
| 15 |
+
"cross_attention_hidden_size": null,
|
| 16 |
+
"decoder_start_token_id": null,
|
| 17 |
+
"diversity_penalty": 0.0,
|
| 18 |
+
"do_sample": false,
|
| 19 |
+
"early_stopping": false,
|
| 20 |
+
"encoder_no_repeat_ngram_size": 0,
|
| 21 |
+
"eos_token_id": 151645,
|
| 22 |
+
"exponential_decay_length_penalty": null,
|
| 23 |
+
"finetuning_task": null,
|
| 24 |
+
"forced_bos_token_id": null,
|
| 25 |
+
"forced_eos_token_id": null,
|
| 26 |
+
"fused_spec_config": null,
|
| 27 |
+
"head_dim": 128,
|
| 28 |
+
"hidden_act": "silu",
|
| 29 |
+
"hidden_size": 2048,
|
| 30 |
+
"id2label": {
|
| 31 |
+
"0": "LABEL_0",
|
| 32 |
+
"1": "LABEL_1"
|
| 33 |
+
},
|
| 34 |
+
"initializer_range": 0.02,
|
| 35 |
+
"intermediate_size": 6144,
|
| 36 |
+
"is_decoder": false,
|
| 37 |
+
"is_encoder_decoder": false,
|
| 38 |
+
"label2id": {
|
| 39 |
+
"LABEL_0": 0,
|
| 40 |
+
"LABEL_1": 1
|
| 41 |
+
},
|
| 42 |
+
"length_penalty": 1.0,
|
| 43 |
+
"max_length": 20,
|
| 44 |
+
"max_position_embeddings": 40960,
|
| 45 |
+
"max_window_layers": 28,
|
| 46 |
+
"metadata": null,
|
| 47 |
+
"min_length": 0,
|
| 48 |
+
"model_type": "qwen3",
|
| 49 |
+
"neuron_config": {
|
| 50 |
+
"activation_quantization_type": null,
|
| 51 |
+
"allow_input_truncation": false,
|
| 52 |
+
"apply_seq_ids_mask": false,
|
| 53 |
+
"async_mode": false,
|
| 54 |
+
"attention_dp_degree": 1,
|
| 55 |
+
"attention_dtype": null,
|
| 56 |
+
"attn_block_cte_nki_kernel_enabled": false,
|
| 57 |
+
"attn_block_tkg_nki_kernel_cache_update": false,
|
| 58 |
+
"attn_block_tkg_nki_kernel_cascaded_attention": false,
|
| 59 |
+
"attn_block_tkg_nki_kernel_enabled": false,
|
| 60 |
+
"attn_cls": {
|
| 61 |
+
"__module__": "neuronx_distributed_inference.models.qwen3.modeling_qwen3",
|
| 62 |
+
"__name__": "NeuronQwen3Attention"
|
| 63 |
+
},
|
| 64 |
+
"attn_kernel_enabled": null,
|
| 65 |
+
"attn_tkg_builtin_kernel_enabled": false,
|
| 66 |
+
"attn_tkg_nki_kernel_enabled": false,
|
| 67 |
+
"batch_size": 1,
|
| 68 |
+
"bucket_n_active_tokens": true,
|
| 69 |
+
"buckets": [
|
| 70 |
+
2048
|
| 71 |
+
],
|
| 72 |
+
"cast_type": "config",
|
| 73 |
+
"cc_pipeline_tiling_factor": 2,
|
| 74 |
+
"chunked_prefill_config": null,
|
| 75 |
+
"context_encoding_buckets": [
|
| 76 |
+
2048
|
| 77 |
+
],
|
| 78 |
+
"cp_degree": 1,
|
| 79 |
+
"ctx_batch_size": 1,
|
| 80 |
+
"disable_kv_cache_tiling": false,
|
| 81 |
+
"draft_model_modules_to_not_convert": null,
|
| 82 |
+
"enable_bucketing": true,
|
| 83 |
+
"enable_cte_modular_flow": false,
|
| 84 |
+
"enable_eagle_draft_input_norm": false,
|
| 85 |
+
"enable_eagle_speculation": false,
|
| 86 |
+
"enable_fused_speculation": false,
|
| 87 |
+
"enable_long_context_mode": false,
|
| 88 |
+
"enable_output_completion_notifications": false,
|
| 89 |
+
"enable_spill_reload_dge": false,
|
| 90 |
+
"enable_token_tree": false,
|
| 91 |
+
"ep_degree": 1,
|
| 92 |
+
"expert_mlp_nki_kernel_enabled": null,
|
| 93 |
+
"flash_decoding_enabled": false,
|
| 94 |
+
"fused_qkv": false,
|
| 95 |
+
"fused_rmsnorm_skip_gamma": false,
|
| 96 |
+
"is_block_kv_layout": null,
|
| 97 |
+
"is_chunked_prefill": false,
|
| 98 |
+
"is_continuous_batching": true,
|
| 99 |
+
"is_eagle_draft": false,
|
| 100 |
+
"is_medusa": false,
|
| 101 |
+
"is_prefill_stage": true,
|
| 102 |
+
"is_prefix_caching": false,
|
| 103 |
+
"k_cache_transposed": false,
|
| 104 |
+
"kv_cache_batch_size": 8,
|
| 105 |
+
"kv_cache_padding_size": 0,
|
| 106 |
+
"kv_cache_quant": false,
|
| 107 |
+
"kv_cache_tiling": false,
|
| 108 |
+
"layer_boundary_markers": false,
|
| 109 |
+
"lm_head_pad": true,
|
| 110 |
+
"lm_head_pad_alignment_size": 1,
|
| 111 |
+
"local_ranks_size": 2,
|
| 112 |
+
"logical_nc_config": 2,
|
| 113 |
+
"lora_config": null,
|
| 114 |
+
"max_batch_size": 8,
|
| 115 |
+
"max_context_length": 4096,
|
| 116 |
+
"max_length": 4096,
|
| 117 |
+
"max_new_tokens": null,
|
| 118 |
+
"medusa_speculation_length": 0,
|
| 119 |
+
"medusa_tree": null,
|
| 120 |
+
"mlp_kernel_enabled": false,
|
| 121 |
+
"mlp_kernel_fuse_residual_add": false,
|
| 122 |
+
"modules_to_not_convert": null,
|
| 123 |
+
"moe_fused_nki_kernel_enabled": null,
|
| 124 |
+
"n_active_tokens": 4096,
|
| 125 |
+
"n_positions": 4096,
|
| 126 |
+
"num_medusa_heads": 0,
|
| 127 |
+
"on_cpu": false,
|
| 128 |
+
"on_device_sampling_config": {
|
| 129 |
+
"deterministic": false,
|
| 130 |
+
"do_sample": false,
|
| 131 |
+
"dynamic": true,
|
| 132 |
+
"global_topk": 256,
|
| 133 |
+
"on_device_sampling_config": true,
|
| 134 |
+
"temperature": 1.0,
|
| 135 |
+
"top_k": 1,
|
| 136 |
+
"top_k_kernel_enabled": false,
|
| 137 |
+
"top_p": 1.0
|
| 138 |
+
},
|
| 139 |
+
"output_logits": false,
|
| 140 |
+
"overrides_torch_dtype": true,
|
| 141 |
+
"pa_block_size": 4096,
|
| 142 |
+
"pa_num_blocks": 8,
|
| 143 |
+
"padding_side": "right",
|
| 144 |
+
"pp_degree": 1,
|
| 145 |
+
"prefix_buckets": null,
|
| 146 |
+
"qk_layernorm": false,
|
| 147 |
+
"qkv_kernel_enabled": false,
|
| 148 |
+
"qkv_kernel_fuse_residual_add": false,
|
| 149 |
+
"qkv_kernel_nbsd_layout": false,
|
| 150 |
+
"quantization_dtype": "int8",
|
| 151 |
+
"quantization_type": "per_tensor_symmetric",
|
| 152 |
+
"quantize_clamp_bound": Infinity,
|
| 153 |
+
"quantized": false,
|
| 154 |
+
"quantized_checkpoints_path": null,
|
| 155 |
+
"quantized_mlp_kernel_enabled": false,
|
| 156 |
+
"rmsnorm_quantize_kernel_enabled": false,
|
| 157 |
+
"router_topk_nki_kernel_enabled": null,
|
| 158 |
+
"rpl_reduce_dtype": null,
|
| 159 |
+
"save_sharded_checkpoint": true,
|
| 160 |
+
"scratchpad_page_size": null,
|
| 161 |
+
"seq_len": 4096,
|
| 162 |
+
"seq_len_threshold_for_cc_tiling": 16384,
|
| 163 |
+
"sequence_parallel_enabled": false,
|
| 164 |
+
"shared_mlp_nki_kernel_enabled": null,
|
| 165 |
+
"skip_sharding": false,
|
| 166 |
+
"skip_warmup": false,
|
| 167 |
+
"spec_batch_size": 8,
|
| 168 |
+
"speculation_length": 0,
|
| 169 |
+
"start_rank_id": 0,
|
| 170 |
+
"strided_context_parallel_kernel_enabled": false,
|
| 171 |
+
"target": null,
|
| 172 |
+
"tensor_capture_config": null,
|
| 173 |
+
"tile_cc": false,
|
| 174 |
+
"tkg_batch_size": 8,
|
| 175 |
+
"token_generation_buckets": null,
|
| 176 |
+
"token_tree_config": null,
|
| 177 |
+
"torch_dtype": "bfloat16",
|
| 178 |
+
"tp_degree": 2,
|
| 179 |
+
"vocab_parallel": false,
|
| 180 |
+
"weight_gather_seq_len_threshold": 32768,
|
| 181 |
+
"weights_to_skip_layout_optimization": [],
|
| 182 |
+
"world_size": 2
|
| 183 |
+
},
|
| 184 |
+
"no_repeat_ngram_size": 0,
|
| 185 |
+
"num_attention_heads": 16,
|
| 186 |
+
"num_beam_groups": 1,
|
| 187 |
+
"num_beams": 1,
|
| 188 |
+
"num_cores_per_group": 1,
|
| 189 |
+
"num_hidden_layers": 28,
|
| 190 |
+
"num_key_value_heads": 8,
|
| 191 |
+
"num_return_sequences": 1,
|
| 192 |
+
"output_attentions": false,
|
| 193 |
+
"output_hidden_states": false,
|
| 194 |
+
"output_scores": false,
|
| 195 |
+
"pad_token_id": 0,
|
| 196 |
+
"prefix": null,
|
| 197 |
+
"problem_type": null,
|
| 198 |
+
"pruned_heads": {},
|
| 199 |
+
"remove_invalid_values": false,
|
| 200 |
+
"repetition_penalty": 1.0,
|
| 201 |
+
"return_dict": true,
|
| 202 |
+
"return_dict_in_generate": false,
|
| 203 |
+
"rms_norm_eps": 1e-06,
|
| 204 |
+
"rope_scaling": null,
|
| 205 |
+
"rope_theta": 1000000,
|
| 206 |
+
"sep_token_id": null,
|
| 207 |
+
"sliding_window": null,
|
| 208 |
+
"suppress_tokens": null,
|
| 209 |
+
"task_specific_params": null,
|
| 210 |
+
"temperature": 1.0,
|
| 211 |
+
"tf_legacy_loss": false,
|
| 212 |
+
"tie_encoder_decoder": false,
|
| 213 |
+
"tie_word_embeddings": true,
|
| 214 |
+
"tokenizer_class": null,
|
| 215 |
+
"top_k": 50,
|
| 216 |
+
"top_p": 1.0,
|
| 217 |
+
"torchscript": false,
|
| 218 |
+
"transformers_version": "4.51.0",
|
| 219 |
+
"typical_p": 1.0,
|
| 220 |
+
"use_bfloat16": false,
|
| 221 |
+
"use_cache": true,
|
| 222 |
+
"use_sliding_window": false,
|
| 223 |
+
"vocab_size": 151936
|
| 224 |
+
}
|
context_encoding_model/_tp0_bk5/command.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
neuronx-cc compile --framework=XLA model.MODULE_96a8f4e12dc810958634+b1e26cef.hlo_module.pb --output model.MODULE_96a8f4e12dc810958634+b1e26cef.neff --target=trn2 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ' --lnc=2 -O1 '--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true' --logfile=log-neuron-cc.txt --verbose=35
|
context_encoding_model/_tp0_bk5/compile_flags.MODULE_96a8f4e12dc810958634+b1e26cef.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
["--target=trn2", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "--lnc=2", "-O1", "--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true", "--logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk5/log-neuron-cc.txt"]
|
context_encoding_model/_tp0_bk5/global_metric_store.json
ADDED
|
@@ -0,0 +1,1177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"Average": {
|
| 3 |
+
"tensorizer": {
|
| 4 |
+
"StaticProfiler::AverageFractalPeUtilization": 99.12728881835938,
|
| 5 |
+
"StaticProfiler::AveragePartitionUtilization": 95.96998596191406,
|
| 6 |
+
"StaticProfiler::AveragePeUtilization": 97.68225860595703,
|
| 7 |
+
"StaticProfiler::LocalizationEfficiency": 56.908729553222656,
|
| 8 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 63.73067855834961,
|
| 9 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
|
| 10 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 0.0
|
| 11 |
+
}
|
| 12 |
+
},
|
| 13 |
+
"Count": {
|
| 14 |
+
"tensorizer": {
|
| 15 |
+
"StaticProfiler::AverageFractalPeUtilization": 1.0,
|
| 16 |
+
"StaticProfiler::AveragePartitionUtilization": 1.0,
|
| 17 |
+
"StaticProfiler::AveragePeUtilization": 1.0,
|
| 18 |
+
"StaticProfiler::LocalizationEfficiency": 1.0,
|
| 19 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1.0,
|
| 20 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 1.0,
|
| 21 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 1.0
|
| 22 |
+
}
|
| 23 |
+
},
|
| 24 |
+
"Sum": {
|
| 25 |
+
"compiletime": {
|
| 26 |
+
"AGOrderingAnalysisPass": 0.03893709182739258,
|
| 27 |
+
"AffinePredicateResolution": 0.00975942611694336,
|
| 28 |
+
"AliasDependencyElimination": 0.00020766258239746094,
|
| 29 |
+
"AliasDependencyInduction": 0.014848947525024414,
|
| 30 |
+
"AliasDependencyReset": 0.0507814884185791,
|
| 31 |
+
"BFComputeCutting": 0.004155397415161133,
|
| 32 |
+
"BirCodeGenLoop": 0.384446382522583,
|
| 33 |
+
"CCOpFusion": 0.11220550537109375,
|
| 34 |
+
"CanonicalizeConv": 1.8999999156221747e-05,
|
| 35 |
+
"CanonicalizeDAGForPGTiling": 0.013774633407592773,
|
| 36 |
+
"CanonicalizeForTensorizer": 5.0000002374872565e-05,
|
| 37 |
+
"CanonicalizeIR": 0.002764463424682617,
|
| 38 |
+
"Canonicalizer": 0.0008950000046752393,
|
| 39 |
+
"CoalesceCCOp": 0.01839923858642578,
|
| 40 |
+
"CommuteConcat": 0.0019075870513916016,
|
| 41 |
+
"DMALocalityOpt": 0.00996088981628418,
|
| 42 |
+
"DMAProfiler": 0.02422189712524414,
|
| 43 |
+
"DMATilingProfiler": 0.007188081741333008,
|
| 44 |
+
"DataLocalityOpt": 0.15634822845458984,
|
| 45 |
+
"DataStreaming": 0.03180813789367676,
|
| 46 |
+
"DeConcat": 0.0020532608032226563,
|
| 47 |
+
"DeadCodeElimination": 0.002146482467651367,
|
| 48 |
+
"DeadStoreElimination": 0.024139404296875,
|
| 49 |
+
"DelinearIndices": 0.013254880905151367,
|
| 50 |
+
"Delinearization": 0.007935047149658203,
|
| 51 |
+
"DelinearizeSPMD": 0.023029565811157227,
|
| 52 |
+
"DoNothing": 0.0005247592926025391,
|
| 53 |
+
"DramToDramTranspose": 0.012213945388793945,
|
| 54 |
+
"DumpGraphAndMetadata": 0.03455543518066406,
|
| 55 |
+
"EliminateDivs": 0.01893448829650879,
|
| 56 |
+
"ExpandBatchNorm": 0.007169485092163086,
|
| 57 |
+
"ExpandISAMacro": 0.019716739654541016,
|
| 58 |
+
"FactorizeBlkDims": 0.0747368335723877,
|
| 59 |
+
"FactorizeThreadAxesInFreeDims": 0.0075495243072509766,
|
| 60 |
+
"FlattenMacroLoop": 0.007609844207763672,
|
| 61 |
+
"GenericAccessSimplifier": 0.0013933181762695313,
|
| 62 |
+
"HoistCompute": 4.999999873689376e-06,
|
| 63 |
+
"IdentifyCrossPassTensors": 3.899999865097925e-05,
|
| 64 |
+
"InferInitValue": 0.10064125061035156,
|
| 65 |
+
"InferIntrinsicOnCC": 0.026311159133911133,
|
| 66 |
+
"InferNeuronTensor": 0.05008339881896973,
|
| 67 |
+
"InferNonlocalTensors": 0.05733203887939453,
|
| 68 |
+
"InferPSumTensor": 0.1221306324005127,
|
| 69 |
+
"InferShardAxis": 0.6304898262023926,
|
| 70 |
+
"InferSharedMemLoc": 0.0429539680480957,
|
| 71 |
+
"InlineNativeKernels": 0.00394749641418457,
|
| 72 |
+
"InsertCoreBarrier": 0.01845526695251465,
|
| 73 |
+
"InsertIOTransposes": 0.04183030128479004,
|
| 74 |
+
"InsertImplicitShardAxisBeforeISel": 0.01711416244506836,
|
| 75 |
+
"InsertLocalTransposes": 0.0077512264251708984,
|
| 76 |
+
"InsertOffloadedTransposes": 0.010181665420532227,
|
| 77 |
+
"LICM": 0.005186319351196289,
|
| 78 |
+
"LateLegalizeInst": 0.04364776611328125,
|
| 79 |
+
"LateLegalizePostSplit": 0.03845643997192383,
|
| 80 |
+
"LateLowerReshapeOp": 0.0019919872283935547,
|
| 81 |
+
"LateLowerTensorOp": 0.0022301673889160156,
|
| 82 |
+
"LateNeuronInstComb": 0.04980278015136719,
|
| 83 |
+
"LayoutPreprocessing": 0.05747699737548828,
|
| 84 |
+
"LayoutPreprocessingAndAnalysis": 0.09093403816223145,
|
| 85 |
+
"LayoutRequirementAnalysis": 0.010792970657348633,
|
| 86 |
+
"LegalizeCCOpLayout": 0.0032892227172851563,
|
| 87 |
+
"LegalizeOpLevelAlias": 0.0013661384582519531,
|
| 88 |
+
"LegalizePartitionReduce": 0.006167411804199219,
|
| 89 |
+
"LegalizeSundaAccess": 0.10145425796508789,
|
| 90 |
+
"LegalizeSundaMacro": 0.051756858825683594,
|
| 91 |
+
"LegalizeType": 0.07339167594909668,
|
| 92 |
+
"LocalLayoutOpt": 0.021276235580444336,
|
| 93 |
+
"LoopFusion": 0.006464719772338867,
|
| 94 |
+
"LoopSplitting": 0.0007054805755615234,
|
| 95 |
+
"LowerBroadcast": 0.01979851722717285,
|
| 96 |
+
"LowerCCOpBlockAxis": 0.008892297744750977,
|
| 97 |
+
"LowerComplexBroadcast": 0.0035398006439208984,
|
| 98 |
+
"LowerIntrinsics": 0.05094194412231445,
|
| 99 |
+
"LowerShardAxis": 0.04483389854431152,
|
| 100 |
+
"LowerTensorOp": 0.025528907775878906,
|
| 101 |
+
"LowerToSendRecv": 0.04537153244018555,
|
| 102 |
+
"LowerTranspose": 0.040845394134521484,
|
| 103 |
+
"MacroGeneration": 0.08503556251525879,
|
| 104 |
+
"MaskPropagation": 0.007714748382568359,
|
| 105 |
+
"MemcastMotion": 1.9999999494757503e-05,
|
| 106 |
+
"MemcpyElimination": 0.062020301818847656,
|
| 107 |
+
"MutateDataType": 0.0020122528076171875,
|
| 108 |
+
"NeuronAliasDependencyInduction": 0.0006520748138427734,
|
| 109 |
+
"NeuronAliasDependencyReset": 0.10503625869750977,
|
| 110 |
+
"NeuronInstComb": 0.057951927185058594,
|
| 111 |
+
"NeuronLICM": 0.05489492416381836,
|
| 112 |
+
"NeuronLoopFusion": 0.05422854423522949,
|
| 113 |
+
"NeuronLoopInterchange": 0.0029349327087402344,
|
| 114 |
+
"NeuronSimplifier": 0.026484966278076172,
|
| 115 |
+
"NeuronSimplifyPredicates": 0.04440903663635254,
|
| 116 |
+
"NeuronValueNumbering": 0.02174234390258789,
|
| 117 |
+
"OptimizeAliasedCopyChain": 0.0018880367279052734,
|
| 118 |
+
"OptimizeNKIKernels": 4.115047454833984,
|
| 119 |
+
"PAGLayoutOpt": 0.11529350280761719,
|
| 120 |
+
"PComputeCutting": 0.010918140411376953,
|
| 121 |
+
"PGLayoutTilingPipeline": 1.6512439250946045,
|
| 122 |
+
"PGTiling": 0.2841973304748535,
|
| 123 |
+
"PadElimination": 0.0008590221405029297,
|
| 124 |
+
"ParAxesAnnotation": 0.07899093627929688,
|
| 125 |
+
"PartialLoopFusion": 0.03534102439880371,
|
| 126 |
+
"PartialSimdFusion": 0.021408557891845703,
|
| 127 |
+
"PenguinizeFunctions": 4.70000013592653e-05,
|
| 128 |
+
"PerfectLoopNest": 0.008621454238891602,
|
| 129 |
+
"PruneFunctions": 4.70000013592653e-05,
|
| 130 |
+
"RecognizeOpIdiom": 0.010253190994262695,
|
| 131 |
+
"Recompute": 0.0005791187286376953,
|
| 132 |
+
"RelaxPredicates": 0.013797521591186523,
|
| 133 |
+
"Rematerialization": 0.0054569244384765625,
|
| 134 |
+
"RemoveOptimizationBarriers": 4.5000000682193786e-05,
|
| 135 |
+
"RemoveShardedPartitionAxes": 0.03261446952819824,
|
| 136 |
+
"ReshapeWeights": 0.001524209976196289,
|
| 137 |
+
"ResolveAccessConflict": 0.019870281219482422,
|
| 138 |
+
"ResolveComplicatePredicates": 0.0053920745849609375,
|
| 139 |
+
"RewriteReplicationMatmul": 0.0025107860565185547,
|
| 140 |
+
"RewriteWeights": 0.009802579879760742,
|
| 141 |
+
"SFKVectorizer": 0.3575756549835205,
|
| 142 |
+
"ScatterMotion": 3.899999865097925e-05,
|
| 143 |
+
"ShardingPropagationAnalysis": 0.10757136344909668,
|
| 144 |
+
"SimpleAllReduceTiling": 0.015942096710205078,
|
| 145 |
+
"Simplifier": 0.005366325378417969,
|
| 146 |
+
"SimplifyMacroPredicates": 0.016243934631347656,
|
| 147 |
+
"SimplifyNeuronTensor": 0.16969990730285645,
|
| 148 |
+
"SimplifySlice": 0.002231597900390625,
|
| 149 |
+
"SimplifyTensor": 0.017529726028442383,
|
| 150 |
+
"SpillPSum": 0.20494413375854492,
|
| 151 |
+
"SplitAPUnionSets": 0.15779972076416016,
|
| 152 |
+
"SplitAccGrp": 0.005539894104003906,
|
| 153 |
+
"StaticProfiler": 0.046514272689819336,
|
| 154 |
+
"StaticTransposeLocalTensor": 0.008464574813842773,
|
| 155 |
+
"SundaISel": 0.07130837440490723,
|
| 156 |
+
"TCTransform": 0.002462148666381836,
|
| 157 |
+
"TensorInitialization": 0.011480093002319336,
|
| 158 |
+
"TensorOpSimplifier": 0.008947134017944336,
|
| 159 |
+
"TensorOpTransform": 0.06947088241577148,
|
| 160 |
+
"TensorizerLegalizationPass": 5.699999746866524e-05,
|
| 161 |
+
"TileCCOps": 0.012774467468261719,
|
| 162 |
+
"TilingProfiler": 0.014863967895507813,
|
| 163 |
+
"TransformConvOp": 0.006424665451049805,
|
| 164 |
+
"TritiumFusion": 0.11082077026367188,
|
| 165 |
+
"ValueNumbering": 0.0049648284912109375,
|
| 166 |
+
"VectorizeDMA": 0.004624843597412109,
|
| 167 |
+
"VectorizeMatMult": 0.028928518295288086,
|
| 168 |
+
"VerifySupportedOps": 3.899999865097925e-05,
|
| 169 |
+
"WeightCoalescing": 0.013041973114013672,
|
| 170 |
+
"ZeroSizeTensorElimination": 0.00021529197692871094,
|
| 171 |
+
"algsimp": 0.001961000030860305,
|
| 172 |
+
"batchnorm_expander": 3.5000000934815034e-05,
|
| 173 |
+
"boundary-marker-removal": 1.1000000085914508e-05,
|
| 174 |
+
"call-inliner": 0.0003279999946244061,
|
| 175 |
+
"canonicalize-boundary-marker": 1.3999999282532372e-05,
|
| 176 |
+
"collective-stream-id-checker": 9.40000027185306e-05,
|
| 177 |
+
"comparison-expander": 0.000506000011228025,
|
| 178 |
+
"computation-deduplicator": 5.499999679159373e-05,
|
| 179 |
+
"config-lowering": 0.00011899999663000926,
|
| 180 |
+
"constant-statistics": 0.0004400000034365803,
|
| 181 |
+
"constant_folding": 0.00030700000934302807,
|
| 182 |
+
"cse": 3.5000000934815034e-05,
|
| 183 |
+
"dce": 7.700000423938036e-05,
|
| 184 |
+
"dot_decomposer": 0.0009110000100918114,
|
| 185 |
+
"dynamic-slice-transpose": 1.1000000085914508e-05,
|
| 186 |
+
"eliminate-redundant-compare": 0.00028899998869746923,
|
| 187 |
+
"emit-offloaded-dropout": 5.6000000768108293e-05,
|
| 188 |
+
"flatten-call-graph": 0.0006600000197067857,
|
| 189 |
+
"fuse-send-recv": 5.5999997130129486e-05,
|
| 190 |
+
"hilo-conditional-to-select": 1.3999999282532372e-05,
|
| 191 |
+
"hilo::LegalizeAlias": 1.2000000424450263e-05,
|
| 192 |
+
"hilo::NeuronInstCombine": 0.00012000000424450263,
|
| 193 |
+
"hilo::NeuronOpFusion": 1.4999999621068127e-05,
|
| 194 |
+
"hilo::ReplaceTokenTypeWithU8Pass": 3.599999763537198e-05,
|
| 195 |
+
"hilo::ScheduleFusion": 4.999999873689376e-06,
|
| 196 |
+
"hilo::SixtyFourHack": 9.899999713525176e-05,
|
| 197 |
+
"hilo::VerifyAliasing": 6.000000212225132e-06,
|
| 198 |
+
"hlo-mac-count": 0.012987000867724419,
|
| 199 |
+
"instruction-histogram": 0.0007619999814778566,
|
| 200 |
+
"io-con-pipe-begin": 7.000000096013537e-06,
|
| 201 |
+
"io-con-pipe-end": 9.999999974752427e-07,
|
| 202 |
+
"io-layout-normalization": 0.001361000002361834,
|
| 203 |
+
"io-statistics": 5.6000000768108293e-05,
|
| 204 |
+
"legalize-ccops-for-tensorizer": 3.000000106112566e-06,
|
| 205 |
+
"legalize-compare": 1.1000000085914508e-05,
|
| 206 |
+
"lower-argminmax-custom-call": 9.999999747378752e-06,
|
| 207 |
+
"map-inline": 0.0008399999933317304,
|
| 208 |
+
"metadata-naming": 4.400000034365803e-05,
|
| 209 |
+
"mlir::detail::OpToOpPassAdaptor": 5.7999997807201e-05,
|
| 210 |
+
"mlir::hlo::MhloToPyPenguin": 0.010812999680638313,
|
| 211 |
+
"mlir::mhlo::LowerComplexExtraPass": 0.0003440000000409782,
|
| 212 |
+
"mlir::mhlo::LowerComplexPass": 0.0004799999878741801,
|
| 213 |
+
"native-to-custom-softmax": 0.00035600000410340726,
|
| 214 |
+
"native-to-custom-softmax-dx": 0.0006880000000819564,
|
| 215 |
+
"neuron-hlo-verifier": 0.01168300025165081,
|
| 216 |
+
"operand_upcaster": 5.5999997130129486e-05,
|
| 217 |
+
"opt-barrier-removal": 0.0003150000120513141,
|
| 218 |
+
"post-par-pipe-begin": 1.4000000192027073e-05,
|
| 219 |
+
"post-par-pipe-end": 0.0,
|
| 220 |
+
"post-partition-simplification": 0.0015290000010281801,
|
| 221 |
+
"pre-par-pipe-begin": 9.999999974752427e-07,
|
| 222 |
+
"pre-par-pipe-end": 0.0,
|
| 223 |
+
"pre-partition-simplification": 0.09849400073289871,
|
| 224 |
+
"replace-minimum-constant": 0.0004569999873638153,
|
| 225 |
+
"reshape-mover": 0.0001030000057653524,
|
| 226 |
+
"simplify-concat": 0.00010699999984353781,
|
| 227 |
+
"simplify-while-loops": 7.700000423938036e-05,
|
| 228 |
+
"transform-variadic-reduce": 5.8999998145736754e-05,
|
| 229 |
+
"tuple-simplifier": 0.00025900002219714224,
|
| 230 |
+
"unpack-nested-aws-ntwsr": 0.00023599999258294702,
|
| 231 |
+
"unroll-while-loop": 1.2000000424450263e-05,
|
| 232 |
+
"zero_sized_hlo_elimination": 0.000783999974373728
|
| 233 |
+
},
|
| 234 |
+
"hilo": {
|
| 235 |
+
"ConstantSize": 7348863.0,
|
| 236 |
+
"HloInputCount": 371.0,
|
| 237 |
+
"HloMacCount": 240674799616.0,
|
| 238 |
+
"HloOutputCount": 57.0,
|
| 239 |
+
"IfmapSize": 3910944768.0,
|
| 240 |
+
"OfmapSize": 1879048192.0,
|
| 241 |
+
"OutputsReadFromCount": 0.0,
|
| 242 |
+
"PassthroughTensorsCount": 0.0,
|
| 243 |
+
"RedundantOutputCount": 0.0,
|
| 244 |
+
"Traffic": 1088551040.0
|
| 245 |
+
},
|
| 246 |
+
"tensorizer": {
|
| 247 |
+
"DMATilingProfiler::TotalInstructionsAfterTiling": 31232.0,
|
| 248 |
+
"StaticProfiler::AifUb": 538.6357421875,
|
| 249 |
+
"StaticProfiler::ArithmeticIntensityTensorizer": 306.53076171875,
|
| 250 |
+
"StaticProfiler::AverageDmaLength": 2517.368896484375,
|
| 251 |
+
"StaticProfiler::DDRTransferBytes": 672177216.0,
|
| 252 |
+
"StaticProfiler::InternalTransferBytes": 407820064.0,
|
| 253 |
+
"StaticProfiler::LoadExpanded": 189029.0,
|
| 254 |
+
"StaticProfiler::StoreExpanded": 13673.0,
|
| 255 |
+
"StaticProfiler::TotalDMAExpanded": 202702.0,
|
| 256 |
+
"StaticProfiler::TotalDynamicInstancesCount": 37700.0,
|
| 257 |
+
"StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 37249.0,
|
| 258 |
+
"StaticProfiler::TotalLNCComm": 0.0,
|
| 259 |
+
"StaticProfiler::TotalLNCCommTransfer": 0.0,
|
| 260 |
+
"TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
|
| 261 |
+
"TilingProfiler::DmaInstructionsAfterTiling": 0.0,
|
| 262 |
+
"TilingProfiler::GenericInstructionsAfterTiling": 4.0,
|
| 263 |
+
"TilingProfiler::MatMultInstructionsAfterTiling": 18720.0,
|
| 264 |
+
"TilingProfiler::NumPfTransposes": 5.0,
|
| 265 |
+
"TilingProfiler::NumPfTransposesForIo": 1.0,
|
| 266 |
+
"TilingProfiler::NumPfTransposesForLocal": 1.0,
|
| 267 |
+
"TilingProfiler::NumPfTransposesForNonlocal": 3.0,
|
| 268 |
+
"TilingProfiler::PfTransposeInstructions": 11041.0,
|
| 269 |
+
"TilingProfiler::PfTransposeInstructionsForIo": 9504.0,
|
| 270 |
+
"TilingProfiler::PfTransposeInstructionsForLocal": 1.0,
|
| 271 |
+
"TilingProfiler::PfTransposeInstructionsForNonlocal": 1536.0,
|
| 272 |
+
"TilingProfiler::ReduceInstructionsAfterTiling": 18.0,
|
| 273 |
+
"TilingProfiler::SimdInstructionsAfterTiling": 604.0,
|
| 274 |
+
"TilingProfiler::TotalInstructionsAfterTiling": 0.0,
|
| 275 |
+
"TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
|
| 276 |
+
"TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
|
| 277 |
+
"TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
|
| 278 |
+
"TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
|
| 279 |
+
"TransformConvOp::conv2d_column_packing": 0.0,
|
| 280 |
+
"TransformConvOp::conv2d_column_packing_1": 0.0,
|
| 281 |
+
"TransformConvOp::conv2d_column_packing_io10": 0.0,
|
| 282 |
+
"TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
|
| 283 |
+
}
|
| 284 |
+
},
|
| 285 |
+
"all": {
|
| 286 |
+
"compiletime": {
|
| 287 |
+
"algsimp": 0.0018090000376105309,
|
| 288 |
+
"call-inliner": 0.0003000000142492354,
|
| 289 |
+
"collective-stream-id-checker": 8.499999967170879e-05,
|
| 290 |
+
"comparison-expander": 0.0004920000210404396,
|
| 291 |
+
"constant-statistics": 0.0004400000034365803,
|
| 292 |
+
"constant_folding": 0.00028300000121816993,
|
| 293 |
+
"dce": 7.400000322377309e-05,
|
| 294 |
+
"dot_decomposer": 0.0009110000100918114,
|
| 295 |
+
"eliminate-redundant-compare": 0.0002789999998640269,
|
| 296 |
+
"flatten-call-graph": 0.0006380000268109143,
|
| 297 |
+
"hlo-mac-count": 0.007658000104129314,
|
| 298 |
+
"instruction-histogram": 0.0007619999814778566,
|
| 299 |
+
"io-con-pipe-begin": 7.000000096013537e-06,
|
| 300 |
+
"io-con-pipe-end": 9.999999974752427e-07,
|
| 301 |
+
"io-layout-normalization": 0.001361000002361834,
|
| 302 |
+
"io-statistics": 5.6000000768108293e-05,
|
| 303 |
+
"map-inline": 0.0008089999901130795,
|
| 304 |
+
"native-to-custom-softmax": 0.000307999987853691,
|
| 305 |
+
"native-to-custom-softmax-dx": 0.0004140000091865659,
|
| 306 |
+
"neuron-hlo-verifier": 0.010607999749481678,
|
| 307 |
+
"opt-barrier-removal": 0.0003150000120513141,
|
| 308 |
+
"pre-par-pipe-begin": 9.999999974752427e-07,
|
| 309 |
+
"pre-par-pipe-end": 0.0,
|
| 310 |
+
"pre-partition-simplification": 0.09849400073289871,
|
| 311 |
+
"replace-minimum-constant": 0.00043899999582208693,
|
| 312 |
+
"reshape-mover": 9.500000305706635e-05,
|
| 313 |
+
"simplify-while-loops": 7.100000220816582e-05,
|
| 314 |
+
"tuple-simplifier": 0.0002460000105202198,
|
| 315 |
+
"unpack-nested-aws-ntwsr": 0.00022600000374950469,
|
| 316 |
+
"unroll-while-loop": 1.2000000424450263e-05,
|
| 317 |
+
"zero_sized_hlo_elimination": 0.000783999974373728
|
| 318 |
+
}
|
| 319 |
+
},
|
| 320 |
+
"attention_isa_kernel": {
|
| 321 |
+
"compiletime": {
|
| 322 |
+
"CoalesceCCOp": 0.00021338462829589844,
|
| 323 |
+
"DMALocalityOpt": 0.0002186298370361328,
|
| 324 |
+
"DMAProfiler": 0.00027632713317871094,
|
| 325 |
+
"DataStreaming": 0.00021409988403320313,
|
| 326 |
+
"DoNothing": 0.0033321380615234375,
|
| 327 |
+
"ExpandISAMacro": 0.00029921531677246094,
|
| 328 |
+
"FactorizeBlkDims": 0.000396728515625,
|
| 329 |
+
"InferPSumTensor": 0.0006210803985595703,
|
| 330 |
+
"InferSharedMemLoc": 0.0006666183471679688,
|
| 331 |
+
"InsertCoreBarrier": 0.00035572052001953125,
|
| 332 |
+
"LateLegalizeInst": 0.00023174285888671875,
|
| 333 |
+
"LateNeuronInstComb": 0.000492095947265625,
|
| 334 |
+
"LegalizeSundaAccess": 0.0002181529998779297,
|
| 335 |
+
"LegalizeType": 0.0002846717834472656,
|
| 336 |
+
"LowerBroadcast": 0.00025916099548339844,
|
| 337 |
+
"LowerIntrinsics": 0.00029730796813964844,
|
| 338 |
+
"LowerTranspose": 0.0002589225769042969,
|
| 339 |
+
"NeuronInstComb": 0.000469207763671875,
|
| 340 |
+
"NeuronLICM": 0.00020599365234375,
|
| 341 |
+
"NeuronSimplifyPredicates": 0.0002067089080810547,
|
| 342 |
+
"NeuronValueNumbering": 0.0002777576446533203,
|
| 343 |
+
"SFKVectorizer": 0.0018928050994873047,
|
| 344 |
+
"SimpleAllReduceTiling": 0.00020241737365722656,
|
| 345 |
+
"SimplifyNeuronTensor": 0.0006334781646728516,
|
| 346 |
+
"SpillPSum": 0.0007383823394775391,
|
| 347 |
+
"WeightCoalescing": 0.00025081634521484375
|
| 348 |
+
}
|
| 349 |
+
},
|
| 350 |
+
"cumsum": {
|
| 351 |
+
"compiletime": {
|
| 352 |
+
"CoalesceCCOp": 0.0003447532653808594,
|
| 353 |
+
"DMALocalityOpt": 0.0003294944763183594,
|
| 354 |
+
"DMAProfiler": 0.0012810230255126953,
|
| 355 |
+
"DataStreaming": 0.0005331039428710938,
|
| 356 |
+
"DoNothing": 0.00017762184143066406,
|
| 357 |
+
"ExpandISAMacro": 0.0009202957153320313,
|
| 358 |
+
"FactorizeBlkDims": 0.0006163120269775391,
|
| 359 |
+
"InferPSumTensor": 0.0011057853698730469,
|
| 360 |
+
"InferSharedMemLoc": 0.0004899501800537109,
|
| 361 |
+
"InsertCoreBarrier": 0.0004894733428955078,
|
| 362 |
+
"LateLegalizeInst": 0.0006704330444335938,
|
| 363 |
+
"LateNeuronInstComb": 0.0013632774353027344,
|
| 364 |
+
"LegalizeSundaAccess": 0.0025315284729003906,
|
| 365 |
+
"LegalizeType": 0.00039649009704589844,
|
| 366 |
+
"LowerBroadcast": 0.0004820823669433594,
|
| 367 |
+
"LowerIntrinsics": 0.0004119873046875,
|
| 368 |
+
"LowerTranspose": 0.0004839897155761719,
|
| 369 |
+
"NeuronInstComb": 0.0013201236724853516,
|
| 370 |
+
"NeuronLICM": 0.0006861686706542969,
|
| 371 |
+
"NeuronSimplifyPredicates": 0.011016607284545898,
|
| 372 |
+
"NeuronValueNumbering": 0.0007073879241943359,
|
| 373 |
+
"SFKVectorizer": 0.012517213821411133,
|
| 374 |
+
"SimpleAllReduceTiling": 0.0003895759582519531,
|
| 375 |
+
"SimplifyNeuronTensor": 0.0022177696228027344,
|
| 376 |
+
"SpillPSum": 0.0009493827819824219,
|
| 377 |
+
"WeightCoalescing": 0.00035071372985839844
|
| 378 |
+
}
|
| 379 |
+
},
|
| 380 |
+
"sg00": {
|
| 381 |
+
"compiletime": {
|
| 382 |
+
"CanonicalizeConv": 1.2999999853491317e-05,
|
| 383 |
+
"CanonicalizeForTensorizer": 1.8000000636675395e-05,
|
| 384 |
+
"Canonicalizer": 0.00029600001289509237,
|
| 385 |
+
"HoistCompute": 3.000000106112566e-06,
|
| 386 |
+
"IdentifyCrossPassTensors": 1.4000000192027073e-05,
|
| 387 |
+
"MemcastMotion": 7.999999979801942e-06,
|
| 388 |
+
"PenguinizeFunctions": 1.8000000636675395e-05,
|
| 389 |
+
"PruneFunctions": 3.000000106112566e-06,
|
| 390 |
+
"RemoveOptimizationBarriers": 1.700000029813964e-05,
|
| 391 |
+
"ScatterMotion": 1.700000029813964e-05,
|
| 392 |
+
"TensorizerLegalizationPass": 2.9000000722589903e-05,
|
| 393 |
+
"VerifySupportedOps": 1.2000000424450263e-05,
|
| 394 |
+
"algsimp": 4.8000001697801054e-05,
|
| 395 |
+
"batchnorm_expander": 1.2000000424450263e-05,
|
| 396 |
+
"boundary-marker-removal": 3.999999989900971e-06,
|
| 397 |
+
"call-inliner": 7.000000096013537e-06,
|
| 398 |
+
"canonicalize-boundary-marker": 4.999999873689376e-06,
|
| 399 |
+
"collective-stream-id-checker": 3.000000106112566e-06,
|
| 400 |
+
"comparison-expander": 3.999999989900971e-06,
|
| 401 |
+
"computation-deduplicator": 1.5999999959603883e-05,
|
| 402 |
+
"config-lowering": 4.3000000005122274e-05,
|
| 403 |
+
"constant_folding": 7.000000096013537e-06,
|
| 404 |
+
"cse": 1.1000000085914508e-05,
|
| 405 |
+
"dce": 9.999999974752427e-07,
|
| 406 |
+
"dynamic-slice-transpose": 3.999999989900971e-06,
|
| 407 |
+
"eliminate-redundant-compare": 3.000000106112566e-06,
|
| 408 |
+
"emit-offloaded-dropout": 3.199999991920777e-05,
|
| 409 |
+
"flatten-call-graph": 7.000000096013537e-06,
|
| 410 |
+
"fuse-send-recv": 1.8000000636675395e-05,
|
| 411 |
+
"hilo-conditional-to-select": 3.999999989900971e-06,
|
| 412 |
+
"hilo::LegalizeAlias": 4.999999873689376e-06,
|
| 413 |
+
"hilo::NeuronInstCombine": 4.8999998398358e-05,
|
| 414 |
+
"hilo::NeuronOpFusion": 4.999999873689376e-06,
|
| 415 |
+
"hilo::ReplaceTokenTypeWithU8Pass": 1.4000000192027073e-05,
|
| 416 |
+
"hilo::ScheduleFusion": 1.9999999949504854e-06,
|
| 417 |
+
"hilo::SixtyFourHack": 1.5999999959603883e-05,
|
| 418 |
+
"hilo::VerifyAliasing": 3.000000106112566e-06,
|
| 419 |
+
"hlo-mac-count": 8.299999899463728e-05,
|
| 420 |
+
"legalize-ccops-for-tensorizer": 9.999999974752427e-07,
|
| 421 |
+
"legalize-compare": 3.999999989900971e-06,
|
| 422 |
+
"lower-argminmax-custom-call": 3.000000106112566e-06,
|
| 423 |
+
"map-inline": 9.999999747378752e-06,
|
| 424 |
+
"metadata-naming": 1.2999999853491317e-05,
|
| 425 |
+
"mlir::detail::OpToOpPassAdaptor": 1.9999999494757503e-05,
|
| 426 |
+
"mlir::hlo::MhloToPyPenguin": 0.0010160000529140234,
|
| 427 |
+
"mlir::mhlo::LowerComplexExtraPass": 0.00013899999612476677,
|
| 428 |
+
"mlir::mhlo::LowerComplexPass": 0.0002699999895412475,
|
| 429 |
+
"native-to-custom-softmax": 3.7999998312443495e-05,
|
| 430 |
+
"native-to-custom-softmax-dx": 0.00024399999529123306,
|
| 431 |
+
"neuron-hlo-verifier": 0.0003870000073220581,
|
| 432 |
+
"operand_upcaster": 1.700000029813964e-05,
|
| 433 |
+
"post-par-pipe-begin": 1.2000000424450263e-05,
|
| 434 |
+
"post-par-pipe-end": 0.0,
|
| 435 |
+
"post-partition-simplification": 0.0005039999959990382,
|
| 436 |
+
"replace-minimum-constant": 4.999999873689376e-06,
|
| 437 |
+
"reshape-mover": 3.000000106112566e-06,
|
| 438 |
+
"simplify-concat": 3.300000025774352e-05,
|
| 439 |
+
"simplify-while-loops": 1.9999999949504854e-06,
|
| 440 |
+
"transform-variadic-reduce": 7.000000096013537e-06,
|
| 441 |
+
"tuple-simplifier": 3.999999989900971e-06,
|
| 442 |
+
"unpack-nested-aws-ntwsr": 3.000000106112566e-06,
|
| 443 |
+
"unroll-while-loop": 0.0
|
| 444 |
+
},
|
| 445 |
+
"hilo": {
|
| 446 |
+
"ArithmeticIntensity": 183.30274963378906,
|
| 447 |
+
"ConstantSize": 7348863.0,
|
| 448 |
+
"HloInputCount": 371.0,
|
| 449 |
+
"HloMacCount": 42949672960.0,
|
| 450 |
+
"HloOutputCount": 57.0,
|
| 451 |
+
"IfmapSize": 3910944768.0,
|
| 452 |
+
"OfmapSize": 1879048192.0,
|
| 453 |
+
"OutputsReadFromCount": 0.0,
|
| 454 |
+
"PassthroughTensorsCount": 0.0,
|
| 455 |
+
"RedundantOutputCount": 0.0,
|
| 456 |
+
"Traffic": 468620064.0
|
| 457 |
+
}
|
| 458 |
+
},
|
| 459 |
+
"sg0000": {
|
| 460 |
+
"compiletime": {
|
| 461 |
+
"AGOrderingAnalysisPass": 0.04064464569091797,
|
| 462 |
+
"AffinePredicateResolution": 0.0019383430480957031,
|
| 463 |
+
"AliasDependencyElimination": 0.00022459030151367188,
|
| 464 |
+
"AliasDependencyInduction": 0.019460439682006836,
|
| 465 |
+
"AliasDependencyReset": 0.04814887046813965,
|
| 466 |
+
"BFComputeCutting": 0.0055506229400634766,
|
| 467 |
+
"BirCodeGenLoop": 0.13215899467468262,
|
| 468 |
+
"CCOpFusion": 0.11969184875488281,
|
| 469 |
+
"CanonicalizeDAGForPGTiling": 0.0033049583435058594,
|
| 470 |
+
"CanonicalizeIR": 0.0060040950775146484,
|
| 471 |
+
"CoalesceCCOp": 0.0054624080657958984,
|
| 472 |
+
"CommuteConcat": 0.002767324447631836,
|
| 473 |
+
"DMALocalityOpt": 0.0027179718017578125,
|
| 474 |
+
"DMAProfiler": 0.01582622528076172,
|
| 475 |
+
"DMATilingProfiler": 0.008585929870605469,
|
| 476 |
+
"DataLocalityOpt": 0.2429823875427246,
|
| 477 |
+
"DataStreaming": 0.013686180114746094,
|
| 478 |
+
"DeConcat": 0.0028448104858398438,
|
| 479 |
+
"DeadCodeElimination": 0.00874471664428711,
|
| 480 |
+
"DeadStoreElimination": 0.07823586463928223,
|
| 481 |
+
"DelinearIndices": 0.01836085319519043,
|
| 482 |
+
"Delinearization": 0.009904146194458008,
|
| 483 |
+
"DelinearizeSPMD": 0.03007340431213379,
|
| 484 |
+
"DoNothing": 9.870529174804688e-05,
|
| 485 |
+
"DramToDramTranspose": 0.014807701110839844,
|
| 486 |
+
"DumpGraphAndMetadata": 0.00868082046508789,
|
| 487 |
+
"EliminateDivs": 0.005564212799072266,
|
| 488 |
+
"ExpandBatchNorm": 0.0029854774475097656,
|
| 489 |
+
"ExpandISAMacro": 0.006433963775634766,
|
| 490 |
+
"FactorizeBlkDims": 0.06867551803588867,
|
| 491 |
+
"FactorizeThreadAxesInFreeDims": 0.008321523666381836,
|
| 492 |
+
"FlattenMacroLoop": 0.006778717041015625,
|
| 493 |
+
"GenericAccessSimplifier": 0.0014896392822265625,
|
| 494 |
+
"InferInitValue": 0.06406569480895996,
|
| 495 |
+
"InferIntrinsicOnCC": 0.022037982940673828,
|
| 496 |
+
"InferNeuronTensor": 0.06763529777526855,
|
| 497 |
+
"InferNonlocalTensors": 0.22275519371032715,
|
| 498 |
+
"InferPSumTensor": 0.15494084358215332,
|
| 499 |
+
"InferShardAxis": 0.5209276676177979,
|
| 500 |
+
"InferSharedMemLoc": 0.017581939697265625,
|
| 501 |
+
"InlineNativeKernels": 0.007895946502685547,
|
| 502 |
+
"InsertCoreBarrier": 0.014360427856445313,
|
| 503 |
+
"InsertIOTransposes": 0.026629209518432617,
|
| 504 |
+
"InsertImplicitShardAxisBeforeISel": 0.018111467361450195,
|
| 505 |
+
"InsertLocalTransposes": 0.02471637725830078,
|
| 506 |
+
"InsertOffloadedTransposes": 0.018056154251098633,
|
| 507 |
+
"LICM": 0.006089210510253906,
|
| 508 |
+
"LateLegalizeInst": 0.020943164825439453,
|
| 509 |
+
"LateLegalizePostSplit": 0.01616668701171875,
|
| 510 |
+
"LateLowerReshapeOp": 0.004019498825073242,
|
| 511 |
+
"LateLowerTensorOp": 0.014237642288208008,
|
| 512 |
+
"LateNeuronInstComb": 0.02029895782470703,
|
| 513 |
+
"LayoutPreprocessing": 0.09618091583251953,
|
| 514 |
+
"LayoutPreprocessingAndAnalysis": 0.1460561752319336,
|
| 515 |
+
"LayoutRequirementAnalysis": 0.01375579833984375,
|
| 516 |
+
"LegalizeCCOpLayout": 0.004752159118652344,
|
| 517 |
+
"LegalizeOpLevelAlias": 0.001943826675415039,
|
| 518 |
+
"LegalizePartitionReduce": 0.002205371856689453,
|
| 519 |
+
"LegalizeSundaAccess": 0.08727788925170898,
|
| 520 |
+
"LegalizeSundaMacro": 0.017870187759399414,
|
| 521 |
+
"LegalizeType": 0.01916980743408203,
|
| 522 |
+
"LocalLayoutOpt": 0.049512386322021484,
|
| 523 |
+
"LoopFusion": 0.012260913848876953,
|
| 524 |
+
"LoopSplitting": 0.0006864070892333984,
|
| 525 |
+
"LowerBroadcast": 0.006807088851928711,
|
| 526 |
+
"LowerCCOpBlockAxis": 0.007787466049194336,
|
| 527 |
+
"LowerComplexBroadcast": 0.004546642303466797,
|
| 528 |
+
"LowerIntrinsics": 0.04405355453491211,
|
| 529 |
+
"LowerShardAxis": 0.033060312271118164,
|
| 530 |
+
"LowerTensorOp": 0.026821613311767578,
|
| 531 |
+
"LowerToSendRecv": 0.011995553970336914,
|
| 532 |
+
"LowerTranspose": 0.02594161033630371,
|
| 533 |
+
"MacroGeneration": 0.11522269248962402,
|
| 534 |
+
"MaskPropagation": 0.003435373306274414,
|
| 535 |
+
"MemcpyElimination": 0.2497720718383789,
|
| 536 |
+
"MutateDataType": 0.0027208328247070313,
|
| 537 |
+
"NeuronAliasDependencyInduction": 0.002033233642578125,
|
| 538 |
+
"NeuronAliasDependencyReset": 0.07921051979064941,
|
| 539 |
+
"NeuronInstComb": 0.018134593963623047,
|
| 540 |
+
"NeuronLICM": 0.037050724029541016,
|
| 541 |
+
"NeuronLoopFusion": 0.037982940673828125,
|
| 542 |
+
"NeuronLoopInterchange": 0.0038917064666748047,
|
| 543 |
+
"NeuronSimplifier": 0.022843360900878906,
|
| 544 |
+
"NeuronSimplifyPredicates": 0.003104686737060547,
|
| 545 |
+
"NeuronValueNumbering": 0.009130239486694336,
|
| 546 |
+
"OptimizeAliasedCopyChain": 0.004662990570068359,
|
| 547 |
+
"OptimizeNKIKernels": 0.3685793876647949,
|
| 548 |
+
"PAGLayoutOpt": 0.6570594310760498,
|
| 549 |
+
"PComputeCutting": 0.012747764587402344,
|
| 550 |
+
"PGLayoutTilingPipeline": 2.4684011936187744,
|
| 551 |
+
"PGTiling": 0.4522573947906494,
|
| 552 |
+
"PadElimination": 0.005415916442871094,
|
| 553 |
+
"ParAxesAnnotation": 0.5855293273925781,
|
| 554 |
+
"PartialLoopFusion": 0.06675910949707031,
|
| 555 |
+
"PartialSimdFusion": 0.07990288734436035,
|
| 556 |
+
"PerfectLoopNest": 0.004445075988769531,
|
| 557 |
+
"RecognizeOpIdiom": 0.02440333366394043,
|
| 558 |
+
"Recompute": 0.0006387233734130859,
|
| 559 |
+
"RelaxPredicates": 0.0069468021392822266,
|
| 560 |
+
"Rematerialization": 0.011609554290771484,
|
| 561 |
+
"RemoveShardedPartitionAxes": 0.029452085494995117,
|
| 562 |
+
"ReshapeWeights": 0.0011801719665527344,
|
| 563 |
+
"ResolveAccessConflict": 0.012258052825927734,
|
| 564 |
+
"ResolveComplicatePredicates": 0.0021598339080810547,
|
| 565 |
+
"RewriteReplicationMatmul": 0.0023620128631591797,
|
| 566 |
+
"RewriteWeights": 0.005594730377197266,
|
| 567 |
+
"SFKVectorizer": 0.6774003505706787,
|
| 568 |
+
"ShardingPropagationAnalysis": 0.07418251037597656,
|
| 569 |
+
"SimpleAllReduceTiling": 0.011443138122558594,
|
| 570 |
+
"Simplifier": 0.006997346878051758,
|
| 571 |
+
"SimplifyMacroPredicates": 0.010604381561279297,
|
| 572 |
+
"SimplifyNeuronTensor": 0.026854515075683594,
|
| 573 |
+
"SimplifySlice": 0.0022373199462890625,
|
| 574 |
+
"SimplifyTensor": 0.013662576675415039,
|
| 575 |
+
"SpillPSum": 0.04489874839782715,
|
| 576 |
+
"SplitAPUnionSets": 0.09562921524047852,
|
| 577 |
+
"SplitAccGrp": 0.0030364990234375,
|
| 578 |
+
"StaticProfiler": 0.02321648597717285,
|
| 579 |
+
"StaticTransposeLocalTensor": 0.004773139953613281,
|
| 580 |
+
"SundaISel": 0.08316183090209961,
|
| 581 |
+
"TCTransform": 0.0036308765411376953,
|
| 582 |
+
"TensorInitialization": 0.008217096328735352,
|
| 583 |
+
"TensorOpSimplifier": 0.013900995254516602,
|
| 584 |
+
"TensorOpTransform": 0.04661202430725098,
|
| 585 |
+
"TileCCOps": 0.03966546058654785,
|
| 586 |
+
"TilingProfiler": 0.02010059356689453,
|
| 587 |
+
"TransformConvOp": 0.00817561149597168,
|
| 588 |
+
"TritiumFusion": 0.12114953994750977,
|
| 589 |
+
"ValueNumbering": 0.01564812660217285,
|
| 590 |
+
"VectorizeDMA": 0.007418394088745117,
|
| 591 |
+
"VectorizeMatMult": 0.042043209075927734,
|
| 592 |
+
"WeightCoalescing": 0.008504390716552734,
|
| 593 |
+
"ZeroSizeTensorElimination": 0.0001614093780517578
|
| 594 |
+
},
|
| 595 |
+
"tensorizer": {
|
| 596 |
+
"DMATilingProfiler::TotalInstructionsAfterTiling": 6983.0,
|
| 597 |
+
"StaticProfiler::AifUb": 127.67816925048828,
|
| 598 |
+
"StaticProfiler::ArithmeticIntensityTensorizer": 265.79534912109375,
|
| 599 |
+
"StaticProfiler::AverageDmaLength": 2094.913818359375,
|
| 600 |
+
"StaticProfiler::AverageFractalPeUtilization": 99.83814239501953,
|
| 601 |
+
"StaticProfiler::AveragePartitionUtilization": 99.57943725585938,
|
| 602 |
+
"StaticProfiler::AveragePeUtilization": 99.35083770751953,
|
| 603 |
+
"StaticProfiler::DDRTransferBytes": 237259264.0,
|
| 604 |
+
"StaticProfiler::InternalTransferBytes": 225476608.0,
|
| 605 |
+
"StaticProfiler::LoadExpanded": 36391.0,
|
| 606 |
+
"StaticProfiler::LocalizationEfficiency": 208.176025390625,
|
| 607 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 340.6685791015625,
|
| 608 |
+
"StaticProfiler::StoreExpanded": 27137.0,
|
| 609 |
+
"StaticProfiler::TotalDMAExpanded": 63528.0,
|
| 610 |
+
"StaticProfiler::TotalDynamicInstancesCount": 10455.0,
|
| 611 |
+
"StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 10430.0,
|
| 612 |
+
"StaticProfiler::TotalLNCComm": 0.0,
|
| 613 |
+
"StaticProfiler::TotalLNCCommTransfer": 0.0,
|
| 614 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
|
| 615 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
|
| 616 |
+
"TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
|
| 617 |
+
"TilingProfiler::DmaInstructionsAfterTiling": 0.0,
|
| 618 |
+
"TilingProfiler::GenericInstructionsAfterTiling": 384.0,
|
| 619 |
+
"TilingProfiler::MatMultInstructionsAfterTiling": 3104.0,
|
| 620 |
+
"TilingProfiler::NumPfTransposes": 7.0,
|
| 621 |
+
"TilingProfiler::NumPfTransposesForIo": 0.0,
|
| 622 |
+
"TilingProfiler::NumPfTransposesForLocal": 3.0,
|
| 623 |
+
"TilingProfiler::NumPfTransposesForNonlocal": 4.0,
|
| 624 |
+
"TilingProfiler::PfTransposeInstructions": 1792.0,
|
| 625 |
+
"TilingProfiler::PfTransposeInstructionsForIo": 0.0,
|
| 626 |
+
"TilingProfiler::PfTransposeInstructionsForLocal": 512.0,
|
| 627 |
+
"TilingProfiler::PfTransposeInstructionsForNonlocal": 1280.0,
|
| 628 |
+
"TilingProfiler::ReduceInstructionsAfterTiling": 0.0,
|
| 629 |
+
"TilingProfiler::SimdInstructionsAfterTiling": 932.0,
|
| 630 |
+
"TilingProfiler::TotalInstructionsAfterTiling": 0.0,
|
| 631 |
+
"TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
|
| 632 |
+
"TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
|
| 633 |
+
"TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
|
| 634 |
+
"TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
|
| 635 |
+
"TransformConvOp::conv2d_column_packing": 0.0,
|
| 636 |
+
"TransformConvOp::conv2d_column_packing_1": 0.0,
|
| 637 |
+
"TransformConvOp::conv2d_column_packing_io10": 0.0,
|
| 638 |
+
"TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
|
| 639 |
+
}
|
| 640 |
+
},
|
| 641 |
+
"sg0001": {
|
| 642 |
+
"compiletime": {
|
| 643 |
+
"AGOrderingAnalysisPass": 0.09063863754272461,
|
| 644 |
+
"AffinePredicateResolution": 0.0031011104583740234,
|
| 645 |
+
"AliasDependencyElimination": 0.000255584716796875,
|
| 646 |
+
"AliasDependencyInduction": 0.012615680694580078,
|
| 647 |
+
"AliasDependencyReset": 0.04242563247680664,
|
| 648 |
+
"BFComputeCutting": 0.005561113357543945,
|
| 649 |
+
"BirCodeGenLoop": 0.09979081153869629,
|
| 650 |
+
"CCOpFusion": 0.1346728801727295,
|
| 651 |
+
"CanonicalizeDAGForPGTiling": 0.012668848037719727,
|
| 652 |
+
"CanonicalizeIR": 0.005399465560913086,
|
| 653 |
+
"CoalesceCCOp": 0.007870197296142578,
|
| 654 |
+
"CommuteConcat": 0.002213716506958008,
|
| 655 |
+
"DMALocalityOpt": 0.008012056350708008,
|
| 656 |
+
"DMAProfiler": 0.017035484313964844,
|
| 657 |
+
"DMATilingProfiler": 0.014662027359008789,
|
| 658 |
+
"DataLocalityOpt": 0.35089898109436035,
|
| 659 |
+
"DataStreaming": 0.0234222412109375,
|
| 660 |
+
"DeConcat": 0.00548243522644043,
|
| 661 |
+
"DeadCodeElimination": 0.010943174362182617,
|
| 662 |
+
"DeadStoreElimination": 0.037809133529663086,
|
| 663 |
+
"DelinearIndices": 0.028621196746826172,
|
| 664 |
+
"Delinearization": 0.0106201171875,
|
| 665 |
+
"DelinearizeSPMD": 0.029047727584838867,
|
| 666 |
+
"DoNothing": 0.00011301040649414063,
|
| 667 |
+
"DramToDramTranspose": 0.01769733428955078,
|
| 668 |
+
"DumpGraphAndMetadata": 0.013274908065795898,
|
| 669 |
+
"EliminateDivs": 0.006105184555053711,
|
| 670 |
+
"ExpandBatchNorm": 0.0027565956115722656,
|
| 671 |
+
"ExpandISAMacro": 0.01057887077331543,
|
| 672 |
+
"FactorizeBlkDims": 0.06908917427062988,
|
| 673 |
+
"FactorizeThreadAxesInFreeDims": 0.00501704216003418,
|
| 674 |
+
"FlattenMacroLoop": 0.01100611686706543,
|
| 675 |
+
"GenericAccessSimplifier": 0.0046689510345458984,
|
| 676 |
+
"InferInitValue": 0.07929110527038574,
|
| 677 |
+
"InferIntrinsicOnCC": 0.03535032272338867,
|
| 678 |
+
"InferNeuronTensor": 0.07708048820495605,
|
| 679 |
+
"InferNonlocalTensors": 0.09707069396972656,
|
| 680 |
+
"InferPSumTensor": 0.0996854305267334,
|
| 681 |
+
"InferShardAxis": 0.6792669296264648,
|
| 682 |
+
"InferSharedMemLoc": 0.009181737899780273,
|
| 683 |
+
"InlineNativeKernels": 0.0036575794219970703,
|
| 684 |
+
"InsertCoreBarrier": 0.015471458435058594,
|
| 685 |
+
"InsertIOTransposes": 0.04584240913391113,
|
| 686 |
+
"InsertImplicitShardAxisBeforeISel": 0.008542537689208984,
|
| 687 |
+
"InsertLocalTransposes": 0.029177427291870117,
|
| 688 |
+
"InsertOffloadedTransposes": 0.01767134666442871,
|
| 689 |
+
"LICM": 0.007311820983886719,
|
| 690 |
+
"LateLegalizeInst": 0.021373271942138672,
|
| 691 |
+
"LateLegalizePostSplit": 0.013000011444091797,
|
| 692 |
+
"LateLowerReshapeOp": 0.002672910690307617,
|
| 693 |
+
"LateLowerTensorOp": 0.022157907485961914,
|
| 694 |
+
"LateNeuronInstComb": 0.038089752197265625,
|
| 695 |
+
"LayoutPreprocessing": 0.0897824764251709,
|
| 696 |
+
"LayoutPreprocessingAndAnalysis": 0.140883207321167,
|
| 697 |
+
"LayoutRequirementAnalysis": 0.011104106903076172,
|
| 698 |
+
"LegalizeCCOpLayout": 0.0038611888885498047,
|
| 699 |
+
"LegalizeOpLevelAlias": 0.005839109420776367,
|
| 700 |
+
"LegalizePartitionReduce": 0.0055887699127197266,
|
| 701 |
+
"LegalizeSundaAccess": 0.053086042404174805,
|
| 702 |
+
"LegalizeSundaMacro": 0.020623445510864258,
|
| 703 |
+
"LegalizeType": 0.009373188018798828,
|
| 704 |
+
"LocalLayoutOpt": 0.07568526268005371,
|
| 705 |
+
"LoopFusion": 0.03827691078186035,
|
| 706 |
+
"LoopSplitting": 0.0006964206695556641,
|
| 707 |
+
"LowerBroadcast": 0.0038139820098876953,
|
| 708 |
+
"LowerCCOpBlockAxis": 0.015240907669067383,
|
| 709 |
+
"LowerComplexBroadcast": 0.00460052490234375,
|
| 710 |
+
"LowerIntrinsics": 0.06653690338134766,
|
| 711 |
+
"LowerShardAxis": 0.034250497817993164,
|
| 712 |
+
"LowerTensorOp": 0.024506807327270508,
|
| 713 |
+
"LowerToSendRecv": 0.00830531120300293,
|
| 714 |
+
"LowerTranspose": 0.026538848876953125,
|
| 715 |
+
"MacroGeneration": 0.1462860107421875,
|
| 716 |
+
"MaskPropagation": 0.004972219467163086,
|
| 717 |
+
"MemcpyElimination": 0.17155957221984863,
|
| 718 |
+
"MutateDataType": 0.0026092529296875,
|
| 719 |
+
"NeuronAliasDependencyInduction": 0.0009496212005615234,
|
| 720 |
+
"NeuronAliasDependencyReset": 0.029055118560791016,
|
| 721 |
+
"NeuronInstComb": 0.010199785232543945,
|
| 722 |
+
"NeuronLICM": 0.02064967155456543,
|
| 723 |
+
"NeuronLoopFusion": 0.045073747634887695,
|
| 724 |
+
"NeuronLoopInterchange": 0.004991292953491211,
|
| 725 |
+
"NeuronSimplifier": 0.04068398475646973,
|
| 726 |
+
"NeuronSimplifyPredicates": 0.012614011764526367,
|
| 727 |
+
"NeuronValueNumbering": 0.008387327194213867,
|
| 728 |
+
"OptimizeAliasedCopyChain": 0.004460334777832031,
|
| 729 |
+
"OptimizeNKIKernels": 0.3194434642791748,
|
| 730 |
+
"PAGLayoutOpt": 0.48951292037963867,
|
| 731 |
+
"PComputeCutting": 0.014848470687866211,
|
| 732 |
+
"PGLayoutTilingPipeline": 2.5451276302337646,
|
| 733 |
+
"PGTiling": 0.5836856365203857,
|
| 734 |
+
"PadElimination": 0.000995635986328125,
|
| 735 |
+
"ParAxesAnnotation": 0.40463972091674805,
|
| 736 |
+
"PartialLoopFusion": 0.06643557548522949,
|
| 737 |
+
"PartialSimdFusion": 0.13411688804626465,
|
| 738 |
+
"PerfectLoopNest": 0.0027947425842285156,
|
| 739 |
+
"RecognizeOpIdiom": 0.01806020736694336,
|
| 740 |
+
"Recompute": 0.0004432201385498047,
|
| 741 |
+
"RelaxPredicates": 0.009535789489746094,
|
| 742 |
+
"Rematerialization": 0.008739471435546875,
|
| 743 |
+
"RemoveShardedPartitionAxes": 0.0267181396484375,
|
| 744 |
+
"ReshapeWeights": 0.0024602413177490234,
|
| 745 |
+
"ResolveAccessConflict": 0.00865793228149414,
|
| 746 |
+
"ResolveComplicatePredicates": 0.007423877716064453,
|
| 747 |
+
"RewriteReplicationMatmul": 0.003094196319580078,
|
| 748 |
+
"RewriteWeights": 0.008661746978759766,
|
| 749 |
+
"SFKVectorizer": 0.5552070140838623,
|
| 750 |
+
"ShardingPropagationAnalysis": 0.07864713668823242,
|
| 751 |
+
"SimpleAllReduceTiling": 0.009680747985839844,
|
| 752 |
+
"Simplifier": 0.010446548461914063,
|
| 753 |
+
"SimplifyMacroPredicates": 0.012853145599365234,
|
| 754 |
+
"SimplifyNeuronTensor": 0.025235891342163086,
|
| 755 |
+
"SimplifySlice": 0.001861572265625,
|
| 756 |
+
"SimplifyTensor": 0.017523765563964844,
|
| 757 |
+
"SpillPSum": 0.09313821792602539,
|
| 758 |
+
"SplitAPUnionSets": 0.07895660400390625,
|
| 759 |
+
"SplitAccGrp": 0.0044307708740234375,
|
| 760 |
+
"StaticProfiler": 0.014701604843139648,
|
| 761 |
+
"StaticTransposeLocalTensor": 0.008467674255371094,
|
| 762 |
+
"SundaISel": 0.07091832160949707,
|
| 763 |
+
"TCTransform": 0.0018222332000732422,
|
| 764 |
+
"TensorInitialization": 0.008383989334106445,
|
| 765 |
+
"TensorOpSimplifier": 0.013144254684448242,
|
| 766 |
+
"TensorOpTransform": 0.17133593559265137,
|
| 767 |
+
"TileCCOps": 0.018372297286987305,
|
| 768 |
+
"TilingProfiler": 0.022103309631347656,
|
| 769 |
+
"TransformConvOp": 0.00668644905090332,
|
| 770 |
+
"TritiumFusion": 0.25888824462890625,
|
| 771 |
+
"ValueNumbering": 0.00537419319152832,
|
| 772 |
+
"VectorizeDMA": 0.018125534057617188,
|
| 773 |
+
"VectorizeMatMult": 0.04329061508178711,
|
| 774 |
+
"WeightCoalescing": 0.006384849548339844,
|
| 775 |
+
"ZeroSizeTensorElimination": 0.00020265579223632813
|
| 776 |
+
},
|
| 777 |
+
"tensorizer": {
|
| 778 |
+
"DMATilingProfiler::TotalInstructionsAfterTiling": 16532.0,
|
| 779 |
+
"StaticProfiler::AifUb": 911.9026489257813,
|
| 780 |
+
"StaticProfiler::ArithmeticIntensityTensorizer": 525.61767578125,
|
| 781 |
+
"StaticProfiler::AverageDmaLength": 2890.18798828125,
|
| 782 |
+
"StaticProfiler::AverageFractalPeUtilization": 100.0,
|
| 783 |
+
"StaticProfiler::AveragePartitionUtilization": 99.69086456298828,
|
| 784 |
+
"StaticProfiler::AveragePeUtilization": 100.0,
|
| 785 |
+
"StaticProfiler::DDRTransferBytes": 440600576.0,
|
| 786 |
+
"StaticProfiler::InternalTransferBytes": 226492416.0,
|
| 787 |
+
"StaticProfiler::LoadExpanded": 92289.0,
|
| 788 |
+
"StaticProfiler::LocalizationEfficiency": 57.63966751098633,
|
| 789 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 87.69190979003906,
|
| 790 |
+
"StaticProfiler::StoreExpanded": 26625.0,
|
| 791 |
+
"StaticProfiler::TotalDMAExpanded": 118914.0,
|
| 792 |
+
"StaticProfiler::TotalDynamicInstancesCount": 22001.0,
|
| 793 |
+
"StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 22001.0,
|
| 794 |
+
"StaticProfiler::TotalLNCComm": 0.0,
|
| 795 |
+
"StaticProfiler::TotalLNCCommTransfer": 0.0,
|
| 796 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
|
| 797 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
|
| 798 |
+
"TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
|
| 799 |
+
"TilingProfiler::DmaInstructionsAfterTiling": 0.0,
|
| 800 |
+
"TilingProfiler::GenericInstructionsAfterTiling": 256.0,
|
| 801 |
+
"TilingProfiler::MatMultInstructionsAfterTiling": 12288.0,
|
| 802 |
+
"TilingProfiler::NumPfTransposes": 8.0,
|
| 803 |
+
"TilingProfiler::NumPfTransposesForIo": 3.0,
|
| 804 |
+
"TilingProfiler::NumPfTransposesForLocal": 3.0,
|
| 805 |
+
"TilingProfiler::NumPfTransposesForNonlocal": 2.0,
|
| 806 |
+
"TilingProfiler::PfTransposeInstructions": 1984.0,
|
| 807 |
+
"TilingProfiler::PfTransposeInstructionsForIo": 576.0,
|
| 808 |
+
"TilingProfiler::PfTransposeInstructionsForLocal": 384.0,
|
| 809 |
+
"TilingProfiler::PfTransposeInstructionsForNonlocal": 1024.0,
|
| 810 |
+
"TilingProfiler::ReduceInstructionsAfterTiling": 0.0,
|
| 811 |
+
"TilingProfiler::SimdInstructionsAfterTiling": 1188.0,
|
| 812 |
+
"TilingProfiler::TotalInstructionsAfterTiling": 0.0,
|
| 813 |
+
"TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
|
| 814 |
+
"TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
|
| 815 |
+
"TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
|
| 816 |
+
"TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
|
| 817 |
+
"TransformConvOp::conv2d_column_packing": 0.0,
|
| 818 |
+
"TransformConvOp::conv2d_column_packing_1": 0.0,
|
| 819 |
+
"TransformConvOp::conv2d_column_packing_io10": 0.0,
|
| 820 |
+
"TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
|
| 821 |
+
}
|
| 822 |
+
},
|
| 823 |
+
"sg0002": {
|
| 824 |
+
"compiletime": {
|
| 825 |
+
"AGOrderingAnalysisPass": 0.03893709182739258,
|
| 826 |
+
"AffinePredicateResolution": 0.00975942611694336,
|
| 827 |
+
"AliasDependencyElimination": 0.00020766258239746094,
|
| 828 |
+
"AliasDependencyInduction": 0.014848947525024414,
|
| 829 |
+
"AliasDependencyReset": 0.0507814884185791,
|
| 830 |
+
"BFComputeCutting": 0.004155397415161133,
|
| 831 |
+
"BirCodeGenLoop": 0.384446382522583,
|
| 832 |
+
"CCOpFusion": 0.11220550537109375,
|
| 833 |
+
"CanonicalizeDAGForPGTiling": 0.013774633407592773,
|
| 834 |
+
"CanonicalizeIR": 0.002764463424682617,
|
| 835 |
+
"CoalesceCCOp": 0.003862142562866211,
|
| 836 |
+
"CommuteConcat": 0.0019075870513916016,
|
| 837 |
+
"DMALocalityOpt": 0.0027344226837158203,
|
| 838 |
+
"DMAProfiler": 0.009855031967163086,
|
| 839 |
+
"DMATilingProfiler": 0.007188081741333008,
|
| 840 |
+
"DataLocalityOpt": 0.15634822845458984,
|
| 841 |
+
"DataStreaming": 0.008760213851928711,
|
| 842 |
+
"DeConcat": 0.0020532608032226563,
|
| 843 |
+
"DeadCodeElimination": 0.002146482467651367,
|
| 844 |
+
"DeadStoreElimination": 0.024139404296875,
|
| 845 |
+
"DelinearIndices": 0.013254880905151367,
|
| 846 |
+
"Delinearization": 0.007935047149658203,
|
| 847 |
+
"DelinearizeSPMD": 0.023029565811157227,
|
| 848 |
+
"DoNothing": 0.0001049041748046875,
|
| 849 |
+
"DramToDramTranspose": 0.012213945388793945,
|
| 850 |
+
"DumpGraphAndMetadata": 0.03455543518066406,
|
| 851 |
+
"EliminateDivs": 0.01893448829650879,
|
| 852 |
+
"ExpandBatchNorm": 0.007169485092163086,
|
| 853 |
+
"ExpandISAMacro": 0.007604122161865234,
|
| 854 |
+
"FactorizeBlkDims": 0.023853540420532227,
|
| 855 |
+
"FactorizeThreadAxesInFreeDims": 0.0075495243072509766,
|
| 856 |
+
"FlattenMacroLoop": 0.007609844207763672,
|
| 857 |
+
"GenericAccessSimplifier": 0.0013933181762695313,
|
| 858 |
+
"InferInitValue": 0.10064125061035156,
|
| 859 |
+
"InferIntrinsicOnCC": 0.026311159133911133,
|
| 860 |
+
"InferNeuronTensor": 0.05008339881896973,
|
| 861 |
+
"InferNonlocalTensors": 0.05733203887939453,
|
| 862 |
+
"InferPSumTensor": 0.0887153148651123,
|
| 863 |
+
"InferShardAxis": 0.6304898262023926,
|
| 864 |
+
"InferSharedMemLoc": 0.03429460525512695,
|
| 865 |
+
"InlineNativeKernels": 0.00394749641418457,
|
| 866 |
+
"InsertCoreBarrier": 0.009274959564208984,
|
| 867 |
+
"InsertIOTransposes": 0.04183030128479004,
|
| 868 |
+
"InsertImplicitShardAxisBeforeISel": 0.01711416244506836,
|
| 869 |
+
"InsertLocalTransposes": 0.0077512264251708984,
|
| 870 |
+
"InsertOffloadedTransposes": 0.010181665420532227,
|
| 871 |
+
"LICM": 0.005186319351196289,
|
| 872 |
+
"LateLegalizeInst": 0.015667200088500977,
|
| 873 |
+
"LateLegalizePostSplit": 0.03845643997192383,
|
| 874 |
+
"LateLowerReshapeOp": 0.0019919872283935547,
|
| 875 |
+
"LateLowerTensorOp": 0.0022301673889160156,
|
| 876 |
+
"LateNeuronInstComb": 0.018993377685546875,
|
| 877 |
+
"LayoutPreprocessing": 0.05747699737548828,
|
| 878 |
+
"LayoutPreprocessingAndAnalysis": 0.09093403816223145,
|
| 879 |
+
"LayoutRequirementAnalysis": 0.010792970657348633,
|
| 880 |
+
"LegalizeCCOpLayout": 0.0032892227172851563,
|
| 881 |
+
"LegalizeOpLevelAlias": 0.0013661384582519531,
|
| 882 |
+
"LegalizePartitionReduce": 0.006167411804199219,
|
| 883 |
+
"LegalizeSundaAccess": 0.03937268257141113,
|
| 884 |
+
"LegalizeSundaMacro": 0.051756858825683594,
|
| 885 |
+
"LegalizeType": 0.023316621780395508,
|
| 886 |
+
"LocalLayoutOpt": 0.021276235580444336,
|
| 887 |
+
"LoopFusion": 0.006464719772338867,
|
| 888 |
+
"LoopSplitting": 0.0007054805755615234,
|
| 889 |
+
"LowerBroadcast": 0.011565208435058594,
|
| 890 |
+
"LowerCCOpBlockAxis": 0.008892297744750977,
|
| 891 |
+
"LowerComplexBroadcast": 0.0035398006439208984,
|
| 892 |
+
"LowerIntrinsics": 0.04290151596069336,
|
| 893 |
+
"LowerShardAxis": 0.04483389854431152,
|
| 894 |
+
"LowerTensorOp": 0.025528907775878906,
|
| 895 |
+
"LowerToSendRecv": 0.04537153244018555,
|
| 896 |
+
"LowerTranspose": 0.024749279022216797,
|
| 897 |
+
"MacroGeneration": 0.08503556251525879,
|
| 898 |
+
"MaskPropagation": 0.007714748382568359,
|
| 899 |
+
"MemcpyElimination": 0.062020301818847656,
|
| 900 |
+
"MutateDataType": 0.0020122528076171875,
|
| 901 |
+
"NeuronAliasDependencyInduction": 0.0006520748138427734,
|
| 902 |
+
"NeuronAliasDependencyReset": 0.10503625869750977,
|
| 903 |
+
"NeuronInstComb": 0.026773691177368164,
|
| 904 |
+
"NeuronLICM": 0.03244495391845703,
|
| 905 |
+
"NeuronLoopFusion": 0.05422854423522949,
|
| 906 |
+
"NeuronLoopInterchange": 0.0029349327087402344,
|
| 907 |
+
"NeuronSimplifier": 0.026484966278076172,
|
| 908 |
+
"NeuronSimplifyPredicates": 0.02537679672241211,
|
| 909 |
+
"NeuronValueNumbering": 0.005478858947753906,
|
| 910 |
+
"OptimizeAliasedCopyChain": 0.0018880367279052734,
|
| 911 |
+
"OptimizeNKIKernels": 4.115047454833984,
|
| 912 |
+
"PAGLayoutOpt": 0.11529350280761719,
|
| 913 |
+
"PComputeCutting": 0.010918140411376953,
|
| 914 |
+
"PGLayoutTilingPipeline": 1.6512439250946045,
|
| 915 |
+
"PGTiling": 0.2841973304748535,
|
| 916 |
+
"PadElimination": 0.0008590221405029297,
|
| 917 |
+
"ParAxesAnnotation": 0.07899093627929688,
|
| 918 |
+
"PartialLoopFusion": 0.03534102439880371,
|
| 919 |
+
"PartialSimdFusion": 0.021408557891845703,
|
| 920 |
+
"PerfectLoopNest": 0.008621454238891602,
|
| 921 |
+
"RecognizeOpIdiom": 0.010253190994262695,
|
| 922 |
+
"Recompute": 0.0005791187286376953,
|
| 923 |
+
"RelaxPredicates": 0.013797521591186523,
|
| 924 |
+
"Rematerialization": 0.0054569244384765625,
|
| 925 |
+
"RemoveShardedPartitionAxes": 0.03261446952819824,
|
| 926 |
+
"ReshapeWeights": 0.001524209976196289,
|
| 927 |
+
"ResolveAccessConflict": 0.019870281219482422,
|
| 928 |
+
"ResolveComplicatePredicates": 0.0053920745849609375,
|
| 929 |
+
"RewriteReplicationMatmul": 0.0025107860565185547,
|
| 930 |
+
"RewriteWeights": 0.009802579879760742,
|
| 931 |
+
"SFKVectorizer": 0.240997314453125,
|
| 932 |
+
"ShardingPropagationAnalysis": 0.10757136344909668,
|
| 933 |
+
"SimpleAllReduceTiling": 0.0035986900329589844,
|
| 934 |
+
"Simplifier": 0.005366325378417969,
|
| 935 |
+
"SimplifyMacroPredicates": 0.016243934631347656,
|
| 936 |
+
"SimplifyNeuronTensor": 0.016655683517456055,
|
| 937 |
+
"SimplifySlice": 0.002231597900390625,
|
| 938 |
+
"SimplifyTensor": 0.017529726028442383,
|
| 939 |
+
"SpillPSum": 0.03337574005126953,
|
| 940 |
+
"SplitAPUnionSets": 0.15779972076416016,
|
| 941 |
+
"SplitAccGrp": 0.005539894104003906,
|
| 942 |
+
"StaticProfiler": 0.046514272689819336,
|
| 943 |
+
"StaticTransposeLocalTensor": 0.008464574813842773,
|
| 944 |
+
"SundaISel": 0.07130837440490723,
|
| 945 |
+
"TCTransform": 0.002462148666381836,
|
| 946 |
+
"TensorInitialization": 0.011480093002319336,
|
| 947 |
+
"TensorOpSimplifier": 0.008947134017944336,
|
| 948 |
+
"TensorOpTransform": 0.06947088241577148,
|
| 949 |
+
"TileCCOps": 0.012774467468261719,
|
| 950 |
+
"TilingProfiler": 0.014863967895507813,
|
| 951 |
+
"TransformConvOp": 0.006424665451049805,
|
| 952 |
+
"TritiumFusion": 0.11082077026367188,
|
| 953 |
+
"ValueNumbering": 0.0049648284912109375,
|
| 954 |
+
"VectorizeDMA": 0.004624843597412109,
|
| 955 |
+
"VectorizeMatMult": 0.028928518295288086,
|
| 956 |
+
"WeightCoalescing": 0.003192901611328125,
|
| 957 |
+
"ZeroSizeTensorElimination": 0.00021529197692871094
|
| 958 |
+
},
|
| 959 |
+
"tensorizer": {
|
| 960 |
+
"DMATilingProfiler::TotalInstructionsAfterTiling": 31232.0,
|
| 961 |
+
"StaticProfiler::AifUb": 538.6357421875,
|
| 962 |
+
"StaticProfiler::ArithmeticIntensityTensorizer": 306.53076171875,
|
| 963 |
+
"StaticProfiler::AverageDmaLength": 2517.368896484375,
|
| 964 |
+
"StaticProfiler::AverageFractalPeUtilization": 99.12728881835938,
|
| 965 |
+
"StaticProfiler::AveragePartitionUtilization": 95.96998596191406,
|
| 966 |
+
"StaticProfiler::AveragePeUtilization": 97.68225860595703,
|
| 967 |
+
"StaticProfiler::DDRTransferBytes": 672177216.0,
|
| 968 |
+
"StaticProfiler::InternalTransferBytes": 407820064.0,
|
| 969 |
+
"StaticProfiler::LoadExpanded": 189029.0,
|
| 970 |
+
"StaticProfiler::LocalizationEfficiency": 56.908729553222656,
|
| 971 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 63.73067855834961,
|
| 972 |
+
"StaticProfiler::StoreExpanded": 13673.0,
|
| 973 |
+
"StaticProfiler::TotalDMAExpanded": 202702.0,
|
| 974 |
+
"StaticProfiler::TotalDynamicInstancesCount": 37700.0,
|
| 975 |
+
"StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 37249.0,
|
| 976 |
+
"StaticProfiler::TotalLNCComm": 0.0,
|
| 977 |
+
"StaticProfiler::TotalLNCCommTransfer": 0.0,
|
| 978 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
|
| 979 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
|
| 980 |
+
"TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
|
| 981 |
+
"TilingProfiler::DmaInstructionsAfterTiling": 0.0,
|
| 982 |
+
"TilingProfiler::GenericInstructionsAfterTiling": 4.0,
|
| 983 |
+
"TilingProfiler::MatMultInstructionsAfterTiling": 18720.0,
|
| 984 |
+
"TilingProfiler::NumPfTransposes": 5.0,
|
| 985 |
+
"TilingProfiler::NumPfTransposesForIo": 1.0,
|
| 986 |
+
"TilingProfiler::NumPfTransposesForLocal": 1.0,
|
| 987 |
+
"TilingProfiler::NumPfTransposesForNonlocal": 3.0,
|
| 988 |
+
"TilingProfiler::PfTransposeInstructions": 11041.0,
|
| 989 |
+
"TilingProfiler::PfTransposeInstructionsForIo": 9504.0,
|
| 990 |
+
"TilingProfiler::PfTransposeInstructionsForLocal": 1.0,
|
| 991 |
+
"TilingProfiler::PfTransposeInstructionsForNonlocal": 1536.0,
|
| 992 |
+
"TilingProfiler::ReduceInstructionsAfterTiling": 18.0,
|
| 993 |
+
"TilingProfiler::SimdInstructionsAfterTiling": 604.0,
|
| 994 |
+
"TilingProfiler::TotalInstructionsAfterTiling": 0.0,
|
| 995 |
+
"TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
|
| 996 |
+
"TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
|
| 997 |
+
"TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
|
| 998 |
+
"TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
|
| 999 |
+
"TransformConvOp::conv2d_column_packing": 0.0,
|
| 1000 |
+
"TransformConvOp::conv2d_column_packing_1": 0.0,
|
| 1001 |
+
"TransformConvOp::conv2d_column_packing_io10": 0.0,
|
| 1002 |
+
"TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
|
| 1003 |
+
}
|
| 1004 |
+
},
|
| 1005 |
+
"sg01": {
|
| 1006 |
+
"compiletime": {
|
| 1007 |
+
"CanonicalizeConv": 6.000000212225132e-06,
|
| 1008 |
+
"CanonicalizeForTensorizer": 1.2000000424450263e-05,
|
| 1009 |
+
"Canonicalizer": 0.00033099998836405575,
|
| 1010 |
+
"HoistCompute": 1.9999999949504854e-06,
|
| 1011 |
+
"IdentifyCrossPassTensors": 1.4000000192027073e-05,
|
| 1012 |
+
"MemcastMotion": 6.000000212225132e-06,
|
| 1013 |
+
"PenguinizeFunctions": 1.2000000424450263e-05,
|
| 1014 |
+
"PruneFunctions": 1.2999999853491317e-05,
|
| 1015 |
+
"RemoveOptimizationBarriers": 1.8999999156221747e-05,
|
| 1016 |
+
"ScatterMotion": 1.4000000192027073e-05,
|
| 1017 |
+
"TensorizerLegalizationPass": 1.700000029813964e-05,
|
| 1018 |
+
"VerifySupportedOps": 1.4999999621068127e-05,
|
| 1019 |
+
"algsimp": 5.6000000768108293e-05,
|
| 1020 |
+
"batchnorm_expander": 1.2000000424450263e-05,
|
| 1021 |
+
"boundary-marker-removal": 3.999999989900971e-06,
|
| 1022 |
+
"call-inliner": 1.1000000085914508e-05,
|
| 1023 |
+
"canonicalize-boundary-marker": 4.999999873689376e-06,
|
| 1024 |
+
"collective-stream-id-checker": 3.000000106112566e-06,
|
| 1025 |
+
"comparison-expander": 4.999999873689376e-06,
|
| 1026 |
+
"computation-deduplicator": 1.9999999494757503e-05,
|
| 1027 |
+
"config-lowering": 3.600000127335079e-05,
|
| 1028 |
+
"constant_folding": 9.000000318337698e-06,
|
| 1029 |
+
"cse": 1.2000000424450263e-05,
|
| 1030 |
+
"dce": 9.999999974752427e-07,
|
| 1031 |
+
"dynamic-slice-transpose": 3.999999989900971e-06,
|
| 1032 |
+
"eliminate-redundant-compare": 3.999999989900971e-06,
|
| 1033 |
+
"emit-offloaded-dropout": 1.4000000192027073e-05,
|
| 1034 |
+
"flatten-call-graph": 6.000000212225132e-06,
|
| 1035 |
+
"fuse-send-recv": 2.2000000171829015e-05,
|
| 1036 |
+
"hilo-conditional-to-select": 4.999999873689376e-06,
|
| 1037 |
+
"hilo::LegalizeAlias": 3.999999989900971e-06,
|
| 1038 |
+
"hilo::NeuronInstCombine": 4.70000013592653e-05,
|
| 1039 |
+
"hilo::NeuronOpFusion": 7.999999979801942e-06,
|
| 1040 |
+
"hilo::ReplaceTokenTypeWithU8Pass": 1.4999999621068127e-05,
|
| 1041 |
+
"hilo::ScheduleFusion": 9.999999974752427e-07,
|
| 1042 |
+
"hilo::SixtyFourHack": 9.999999747378752e-06,
|
| 1043 |
+
"hilo::VerifyAliasing": 1.9999999949504854e-06,
|
| 1044 |
+
"hlo-mac-count": 8.800000068731606e-05,
|
| 1045 |
+
"legalize-ccops-for-tensorizer": 9.999999974752427e-07,
|
| 1046 |
+
"legalize-compare": 3.999999989900971e-06,
|
| 1047 |
+
"lower-argminmax-custom-call": 3.999999989900971e-06,
|
| 1048 |
+
"map-inline": 9.999999747378752e-06,
|
| 1049 |
+
"metadata-naming": 1.700000029813964e-05,
|
| 1050 |
+
"mlir::detail::OpToOpPassAdaptor": 1.700000029813964e-05,
|
| 1051 |
+
"mlir::hlo::MhloToPyPenguin": 0.0024689999409019947,
|
| 1052 |
+
"mlir::mhlo::LowerComplexExtraPass": 0.00012599999899975955,
|
| 1053 |
+
"mlir::mhlo::LowerComplexPass": 0.0001630000042496249,
|
| 1054 |
+
"native-to-custom-softmax": 4.999999873689376e-06,
|
| 1055 |
+
"native-to-custom-softmax-dx": 1.2999999853491317e-05,
|
| 1056 |
+
"neuron-hlo-verifier": 0.00035600000410340726,
|
| 1057 |
+
"operand_upcaster": 2.5999999706982635e-05,
|
| 1058 |
+
"post-par-pipe-begin": 9.999999974752427e-07,
|
| 1059 |
+
"post-par-pipe-end": 0.0,
|
| 1060 |
+
"post-partition-simplification": 0.0005549999768845737,
|
| 1061 |
+
"replace-minimum-constant": 6.000000212225132e-06,
|
| 1062 |
+
"reshape-mover": 3.000000106112566e-06,
|
| 1063 |
+
"simplify-concat": 4.199999966658652e-05,
|
| 1064 |
+
"simplify-while-loops": 1.9999999949504854e-06,
|
| 1065 |
+
"transform-variadic-reduce": 7.000000096013537e-06,
|
| 1066 |
+
"tuple-simplifier": 4.999999873689376e-06,
|
| 1067 |
+
"unpack-nested-aws-ntwsr": 3.000000106112566e-06,
|
| 1068 |
+
"unroll-while-loop": 0.0
|
| 1069 |
+
},
|
| 1070 |
+
"hilo": {
|
| 1071 |
+
"ArithmeticIntensity": 1091.5736083984375,
|
| 1072 |
+
"HloMacCount": 120259084288.0,
|
| 1073 |
+
"Traffic": 220340768.0
|
| 1074 |
+
}
|
| 1075 |
+
},
|
| 1076 |
+
"sg02": {
|
| 1077 |
+
"compiletime": {
|
| 1078 |
+
"CanonicalizeConv": 0.0,
|
| 1079 |
+
"CanonicalizeForTensorizer": 1.9999999494757503e-05,
|
| 1080 |
+
"Canonicalizer": 0.0002680000034160912,
|
| 1081 |
+
"HoistCompute": 0.0,
|
| 1082 |
+
"IdentifyCrossPassTensors": 1.1000000085914508e-05,
|
| 1083 |
+
"MemcastMotion": 6.000000212225132e-06,
|
| 1084 |
+
"PenguinizeFunctions": 1.700000029813964e-05,
|
| 1085 |
+
"PruneFunctions": 3.099999958067201e-05,
|
| 1086 |
+
"RemoveOptimizationBarriers": 9.000000318337698e-06,
|
| 1087 |
+
"ScatterMotion": 7.999999979801942e-06,
|
| 1088 |
+
"TensorizerLegalizationPass": 1.1000000085914508e-05,
|
| 1089 |
+
"VerifySupportedOps": 1.2000000424450263e-05,
|
| 1090 |
+
"algsimp": 4.8000001697801054e-05,
|
| 1091 |
+
"batchnorm_expander": 1.1000000085914508e-05,
|
| 1092 |
+
"boundary-marker-removal": 3.000000106112566e-06,
|
| 1093 |
+
"call-inliner": 9.999999747378752e-06,
|
| 1094 |
+
"canonicalize-boundary-marker": 3.999999989900971e-06,
|
| 1095 |
+
"collective-stream-id-checker": 3.000000106112566e-06,
|
| 1096 |
+
"comparison-expander": 4.999999873689376e-06,
|
| 1097 |
+
"computation-deduplicator": 1.8999999156221747e-05,
|
| 1098 |
+
"config-lowering": 3.9999998989515007e-05,
|
| 1099 |
+
"constant_folding": 7.999999979801942e-06,
|
| 1100 |
+
"cse": 1.2000000424450263e-05,
|
| 1101 |
+
"dce": 9.999999974752427e-07,
|
| 1102 |
+
"dynamic-slice-transpose": 3.000000106112566e-06,
|
| 1103 |
+
"eliminate-redundant-compare": 3.000000106112566e-06,
|
| 1104 |
+
"emit-offloaded-dropout": 9.999999747378752e-06,
|
| 1105 |
+
"flatten-call-graph": 9.000000318337698e-06,
|
| 1106 |
+
"fuse-send-recv": 1.5999999959603883e-05,
|
| 1107 |
+
"hilo-conditional-to-select": 4.999999873689376e-06,
|
| 1108 |
+
"hilo::LegalizeAlias": 3.000000106112566e-06,
|
| 1109 |
+
"hilo::NeuronInstCombine": 2.4000000848900527e-05,
|
| 1110 |
+
"hilo::NeuronOpFusion": 1.9999999949504854e-06,
|
| 1111 |
+
"hilo::ReplaceTokenTypeWithU8Pass": 7.000000096013537e-06,
|
| 1112 |
+
"hilo::ScheduleFusion": 1.9999999949504854e-06,
|
| 1113 |
+
"hilo::SixtyFourHack": 7.300000288523734e-05,
|
| 1114 |
+
"hilo::VerifyAliasing": 9.999999974752427e-07,
|
| 1115 |
+
"hlo-mac-count": 0.005158000160008669,
|
| 1116 |
+
"legalize-ccops-for-tensorizer": 9.999999974752427e-07,
|
| 1117 |
+
"legalize-compare": 3.000000106112566e-06,
|
| 1118 |
+
"lower-argminmax-custom-call": 3.000000106112566e-06,
|
| 1119 |
+
"map-inline": 1.1000000085914508e-05,
|
| 1120 |
+
"metadata-naming": 1.4000000192027073e-05,
|
| 1121 |
+
"mlir::detail::OpToOpPassAdaptor": 2.099999983329326e-05,
|
| 1122 |
+
"mlir::hlo::MhloToPyPenguin": 0.007327999919652939,
|
| 1123 |
+
"mlir::mhlo::LowerComplexExtraPass": 7.899999764049426e-05,
|
| 1124 |
+
"mlir::mhlo::LowerComplexPass": 4.70000013592653e-05,
|
| 1125 |
+
"native-to-custom-softmax": 4.999999873689376e-06,
|
| 1126 |
+
"native-to-custom-softmax-dx": 1.700000029813964e-05,
|
| 1127 |
+
"neuron-hlo-verifier": 0.0003319999959785491,
|
| 1128 |
+
"operand_upcaster": 1.2999999853491317e-05,
|
| 1129 |
+
"post-par-pipe-begin": 9.999999974752427e-07,
|
| 1130 |
+
"post-par-pipe-end": 0.0,
|
| 1131 |
+
"post-partition-simplification": 0.00046999999904073775,
|
| 1132 |
+
"replace-minimum-constant": 7.000000096013537e-06,
|
| 1133 |
+
"reshape-mover": 1.9999999949504854e-06,
|
| 1134 |
+
"simplify-concat": 3.199999991920777e-05,
|
| 1135 |
+
"simplify-while-loops": 1.9999999949504854e-06,
|
| 1136 |
+
"transform-variadic-reduce": 4.5000000682193786e-05,
|
| 1137 |
+
"tuple-simplifier": 3.999999989900971e-06,
|
| 1138 |
+
"unpack-nested-aws-ntwsr": 3.999999989900971e-06,
|
| 1139 |
+
"unroll-while-loop": 0.0
|
| 1140 |
+
},
|
| 1141 |
+
"hilo": {
|
| 1142 |
+
"ArithmeticIntensity": 387.7274169921875,
|
| 1143 |
+
"HloMacCount": 77466042368.0,
|
| 1144 |
+
"Traffic": 399590208.0
|
| 1145 |
+
}
|
| 1146 |
+
},
|
| 1147 |
+
"topk": {
|
| 1148 |
+
"compiletime": {
|
| 1149 |
+
"CoalesceCCOp": 0.014192342758178711,
|
| 1150 |
+
"DMALocalityOpt": 0.00689697265625,
|
| 1151 |
+
"DMAProfiler": 0.01308584213256836,
|
| 1152 |
+
"DataStreaming": 0.022514820098876953,
|
| 1153 |
+
"DoNothing": 0.0002422332763671875,
|
| 1154 |
+
"ExpandISAMacro": 0.01119232177734375,
|
| 1155 |
+
"FactorizeBlkDims": 0.05026698112487793,
|
| 1156 |
+
"InferPSumTensor": 0.032309532165527344,
|
| 1157 |
+
"InferSharedMemLoc": 0.008169412612915039,
|
| 1158 |
+
"InsertCoreBarrier": 0.008690834045410156,
|
| 1159 |
+
"LateLegalizeInst": 0.02731013298034668,
|
| 1160 |
+
"LateNeuronInstComb": 0.029446125030517578,
|
| 1161 |
+
"LegalizeSundaAccess": 0.05955004692077637,
|
| 1162 |
+
"LegalizeType": 0.04967856407165527,
|
| 1163 |
+
"LowerBroadcast": 0.0077512264251708984,
|
| 1164 |
+
"LowerIntrinsics": 0.007628440856933594,
|
| 1165 |
+
"LowerTranspose": 0.015612125396728516,
|
| 1166 |
+
"NeuronInstComb": 0.029858112335205078,
|
| 1167 |
+
"NeuronLICM": 0.02176380157470703,
|
| 1168 |
+
"NeuronSimplifyPredicates": 0.008015632629394531,
|
| 1169 |
+
"NeuronValueNumbering": 0.015556097030639648,
|
| 1170 |
+
"SFKVectorizer": 0.10406112670898438,
|
| 1171 |
+
"SimpleAllReduceTiling": 0.01195383071899414,
|
| 1172 |
+
"SimplifyNeuronTensor": 0.15082645416259766,
|
| 1173 |
+
"SpillPSum": 0.17061901092529297,
|
| 1174 |
+
"WeightCoalescing": 0.009498357772827148
|
| 1175 |
+
}
|
| 1176 |
+
}
|
| 1177 |
+
}
|
context_encoding_model/_tp0_bk5/graph.neff
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:56e28f3613a7ada8c1d580c4a0d3979da6436bd82072a724c52018668343c286
|
| 3 |
+
size 3062784
|